# Python source code of test_loop_blocking

""" $lic$
Copyright (C) 2016-2020 by Tsinghua University and The Board of Trustees of
Stanford University

This program is free software: you can redistribute it and/or modify it under
the terms of the Modified BSD-3 License as published by the Open Source
Initiative.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the BSD-3 License for more details.

You should have received a copy of the Modified BSD-3 License along with this
program. If not, see <https://opensource.org/licenses/BSD-3-Clause>.
"""

import itertools
import math
import unittest

from nn_dataflow.core import partition
from nn_dataflow.core import BufShrScheme
from nn_dataflow.core import ConvLayer, PoolingLayer
from nn_dataflow.core import Cost
from nn_dataflow.core import DataDimLoops
from nn_dataflow.core import DataCategoryEnum as de
from nn_dataflow.core import LoopBlockingScheme
from nn_dataflow.core import LoopEnum as le
from nn_dataflow.core import MapStrategyEyeriss
from nn_dataflow.core import MemHierEnum as me
from nn_dataflow.core import NestedLoopDesc
from nn_dataflow.core import NodeRegion
from nn_dataflow.core import Option
from nn_dataflow.core import ParallelEnum as pe
from nn_dataflow.core import PartitionScheme
from nn_dataflow.core import PhyDim2
from nn_dataflow.core import Resource
from nn_dataflow.core import SchedulingConstraint
from nn_dataflow import util

class TestLoopBlockingFixture(unittest.TestCase):
    ''' Base fixture class for LoopBlocking tests. '''
    # pylint: disable=too-many-instance-attributes

    def setUp(self):
        '''
        Build the workloads, resources, nested loop descriptions, partition
        and buffer sharing schemes, options, constraints and cost shared by
        the LoopBlocking tests.
        '''

        # Workload.
        self.layer = {
            'BASE': ConvLayer(12, 10, 28, 3),
            'LGFIL': ConvLayer(2, 4, 28, 20),
            'POOL': PoolingLayer(32, 28, 2),
            'PAR': ConvLayer(24, 36, 56, 3),
        }
        self.batch_size = 4

        # Resource.
        dim_array = PhyDim2(16, 16)
        proc_region = NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                 type=NodeRegion.PROC)
        data_region = NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(1, 1),
                                 type=NodeRegion.DRAM)
        # A PROC-type region, used where a data region would normally go.
        proc_data_region = NodeRegion(origin=PhyDim2(1, 1), dim=PhyDim2(1, 1),
                                      type=NodeRegion.PROC)
        # Capacity large enough to make all schemes valid.
        huge = 1024 ** 3

        def make_resource(size_gbuf, size_regf, proc=proc_region,
                          src=data_region, dst=data_region,
                          no_time_mux=False):
            '''
            Make a Resource with the given buffer sizes. All resources share
            the same PE array, DRAM region, and infinite bandwidths.
            '''
            return Resource(
                proc_region=proc, dram_region=data_region,
                src_data_region=src, dst_data_region=dst,
                dim_array=dim_array, size_gbuf=size_gbuf,
                size_regf=size_regf,
                array_bus_width=float('inf'), dram_bandwidth=float('inf'),
                no_time_mux=no_time_mux)

        self.resource = {}
        # Typical resource.
        self.resource['BASE'] = make_resource(65536, 64)
        # Larger resource with sufficient capacity.
        self.resource['LG'] = make_resource(huge, huge)
        # Small resource.
        self.resource['SM'] = make_resource(4096, 16)
        # Multi-node parallel resource.
        self.resource['PAR'] = make_resource(
            25000, 64,
            proc=NodeRegion(origin=PhyDim2(0, 0), dim=PhyDim2(4, 2),
                            type=NodeRegion.PROC))
        # Resources whose source and/or destination is not a data region.
        self.resource['SRCNOTDATA'] = make_resource(huge, huge,
                                                    src=proc_data_region)
        self.resource['DSTNOTDATA'] = make_resource(huge, huge,
                                                    dst=proc_data_region)
        self.resource['DATALOCAL'] = make_resource(huge, huge,
                                                   src=proc_region,
                                                   dst=proc_region)
        # Filter pinning.
        self.resource['FILPIN'] = make_resource(huge, huge, no_time_mux=True)

        # Nested loop description after mapping.
        def mapped_nld(key):
            ''' Take the first nested loop desc. of the Eyeriss mapping. '''
            strategy = MapStrategyEyeriss(self.layer[key], self.batch_size, 1,
                                          dim_array)
            return next(strategy.gen_nested_loop_desc())

        def fake_nld(usize_gbuf, usize_regf):
            '''
            Make a fake nested loop. The DRAM and GBUF unit accesses are the
            same as the GBUF unit sizes.
            '''
            return NestedLoopDesc(
                loopcnt=(12, 10, 4),
                usize_gbuf=usize_gbuf,
                usize_regf=usize_regf,
                unit_access=(usize_gbuf, usize_gbuf, (3, 9, 7), (1, 1, 1)),
                data_loops=(DataDimLoops(le.IFM, le.OFM),
                            DataDimLoops(le.IFM, le.BAT),
                            DataDimLoops(le.OFM, le.BAT)),
                unit_ops=1, unit_time=1)

        self.nld = {key: mapped_nld(key) for key in ('BASE', 'LGFIL', 'POOL')}
        # Fake nested loop, with zero filter size.
        self.nld['ZERO_FIL'] = fake_nld((0, 1000, 800), (0, 3, 1))
        # Fake nested loop, with zero ifmap size.
        self.nld['ZERO_IFM'] = fake_nld((9, 0, 800), (3, 0, 1))

        # Fake partition scheme.
        self.part = PartitionScheme(range(pe.NUM), ((1, 1),) * pe.NUM)

        # Fake buffer sharing scheme.
        self.bufshr = BufShrScheme(proc_region, self.part)

        # Options.
        max_ntops = 2 ** 30
        bypass_all = (True,) * 3
        self.options = {
            # Basic.
            'BASE': Option(ntops=max_ntops),
            # Multiprocessing.
            'MP': Option(ntops=max_ntops, nprocesses=8),
            # Limited top schemes.
            'NTOPS': Option(ntops=10),
            # Bypass.
            'BYP': Option(sw_gbuf_bypass=bypass_all, ntops=max_ntops),
            # Bypass solver.
            'BYPSOL': Option(sw_gbuf_bypass=bypass_all,
                             sw_solve_loopblocking=True,
                             ntops=max_ntops),
            # Access forwarding.
            'ACCFWD': Option(hw_access_forwarding=True, ntops=max_ntops),
            # Buffer sharing.
            'BUFSHR': Option(hw_gbuf_sharing=True, ntops=max_ntops),
            # Buffer sharing with bypassing.
            'BUFSHR-BYP': Option(sw_gbuf_bypass=bypass_all,
                                 hw_gbuf_sharing=True,
                                 ntops=max_ntops),
        }

        # Constraint.
        self.none_cstr = SchedulingConstraint()
        self.cstr = SchedulingConstraint(topifm=1, topbat=1)

        # Cost.
        self.cost = Cost(mac_op=1, mem_hier=(200, 6, 2, 1),
                         noc_hop=50, idl_unit=50)


    def _lbs(self, bl_ts, bl_ords=None, wlkey='BASE', rsrckey='BASE',
             optkey='BASE'):
        '''
        Make a LoopBlockingScheme instance from the fixture entries selected
        by the workload, resource and option keys.

        If `bl_ords` is not given (or empty), both levels use the loops in
        their natural order, i.e., (0, ..., le.NUM - 1).
        '''
        if not bl_ords:
            natural_ord = tuple(range(le.NUM))
            bl_ords = (natural_ord, natural_ord)

        nld = self.nld[wlkey]
        resource = self.resource[rsrckey]
        options = self.options[optkey]
        return LoopBlockingScheme(nld, bl_ts, bl_ords, resource, self.bufshr,
                                  options)

    def _gen_loopblocking_all(self, wlkey='BASE'):
        '''
        Generate all combinations of loop blocking factors and orders.

        Each yielded item is a pair of the blocking factors (one tuple per
        blocking level, indexed by LoopEnum) and a pair of loop orders.
        '''
        loopcnt = self.nld[wlkey].loopcnt

        # 3-way factorizations of the IFM, OFM and BAT loop counts.
        all_factors = itertools.product(
            util.factorize(loopcnt[le.IFM], 3),
            util.factorize(loopcnt[le.OFM], 3),
            util.factorize(loopcnt[le.BAT], 3))
        # Pairs of loop order permutations.
        all_orders = itertools.product(
            itertools.permutations(range(le.NUM)), repeat=2)

        for (ti, to, tb), orders in itertools.product(all_factors,
                                                      all_orders):
            by_loop = {le.IFM: ti, le.OFM: to, le.BAT: tb}
            lp_ts = [by_loop.get(lpe) for lpe in range(le.NUM)]
            # Transpose from per-loop factors to per-level factors.
            yield tuple(zip(*lp_ts)), orders

    def _make_bl_ts(self, ti_part, to_part, tb_part, wlkey='BASE'):
        '''
        Make a set of blocking factors, as one tuple per blocking level.

        Each of `ti_part`, `to_part`, `tb_part` gives the factors of one loop.
        It can contain one 0 value as a placeholder, which is filled with
        `util.idivc` of the loop count over the product of the other factors.
        '''

        def _fill(factors, lpe):
            ''' Fill the first 0 entry of `factors`, if there is any. '''
            try:
                hole = factors.index(0)
            except ValueError:
                # Nothing to fill, use as is.
                return factors
            others = factors[:hole] + factors[hole+1:]
            return [factors[pos] if pos != hole
                    else util.idivc(self.nld[wlkey].loopcnt[lpe],
                                    util.prod(others))
                    for pos in range(3)]

        by_loop = {}
        by_loop[le.IFM] = _fill(ti_part, le.IFM)
        by_loop[le.OFM] = _fill(to_part, le.OFM)
        by_loop[le.BAT] = _fill(tb_part, le.BAT)

        lp_ts = [by_loop.get(lpe) for lpe in range(le.NUM)]
        # Transpose from per-loop factors to per-level factors.
        return tuple(zip(*lp_ts))

    def _part_nld(self, part, layerkey='PAR'):
        '''
        Make a partitioned NestedLoopDesc.

        The layer is partitioned by `part`, then mapped with
        MapStrategyEyeriss onto the PE array of the 'PAR' resource. The first
        generated nested loop description is returned.
        '''
        sub_layer, sub_batch_size, occupation = part.part_layer(
            self.layer[layerkey], self.batch_size)
        strategy = MapStrategyEyeriss(sub_layer, sub_batch_size, occupation,
                                      self.resource['PAR'].dim_array)
        return next(strategy.gen_nested_loop_desc())

    def _gen_all_partition(self, layerkey='PAR'):
        '''
        Generate the PartitionScheme instances for the given layer over the
        processing region of the 'PAR' resource, with hybrid, batch, and
        ifmap partitioning all enabled.
        '''
        part_options = Option(partition_hybrid=True,
                              partition_batch=True,
                              partition_ifmaps=True,
                              ntops=2 ** 30)

        layer = self.layer[layerkey]
        region_dim = self.resource['PAR'].proc_region.dim
        schemes = partition.gen_partition(layer, self.batch_size, region_dim,
                                          part_options)
        for scheme in schemes:
            yield scheme

    def _total_part_size(self, part, layerkey='PAR'):
        '''
        Get the total partitioned data size.

        Return the filter, ifmap, and ofmap sizes of a layer whose dimensions
        are re-derived from the partitioning `part`. Also check these sizes
        against those of the original layer.
        '''
        layer = self.layer[layerkey]

        def _scaled(total, nparts):
            # `util.idivc` quotient times the divisor; presumably this rounds
            # `total` up to a multiple of `nparts`.
            return util.idivc(total, nparts) * nparts

        ofmp_dim = part.dim(pe.OFMP)
        nifm = _scaled(layer.nifm, part.size(pe.INPP))
        nofm = _scaled(layer.nofm, part.size(pe.OUTP))
        hofm = _scaled(layer.hofm, ofmp_dim.h)
        wofm = _scaled(layer.wofm, ofmp_dim.w)
        batch_size = _scaled(self.batch_size, part.size(pe.BATP))

        full_layer = ConvLayer(nifm, nofm, (hofm, wofm),
                               (layer.hfil, layer.wfil),
                               (layer.htrd, layer.wtrd))
        filter_size = full_layer.total_filter_size()
        ifmap_size = full_layer.total_ifmap_size(batch_size)
        ofmap_size = full_layer.total_ofmap_size(batch_size)

        # Filters: no smaller than the original, with bounded overhead.
        orig_filter_size = layer.total_filter_size()
        self.assertGreaterEqual(filter_size, orig_filter_size)
        self.assertLess(filter_size, orig_filter_size * 1.2 * 1.2)

        # Ofmaps: no smaller than the original, with bounded overhead.
        orig_ofmap_size = layer.total_ofmap_size(self.batch_size)
        self.assertGreaterEqual(ofmap_size, orig_ofmap_size)
        self.assertLess(ofmap_size, orig_ofmap_size * 1.2 * 1.2 * 1.2)

        # Ifmaps: only the lower bound is checked.
        self.assertGreaterEqual(ifmap_size,
                                layer.total_ifmap_size(self.batch_size))

        return filter_size, ifmap_size, ofmap_size

    def _bufshr_params(self, lbs):
        '''
        Get buffer sharing parameters of the LoopBlockingScheme `lbs`.

        Return three items: the subgroup sizes, the rotation unit counts, and
        a list of loops. Each loop in the list is a tuple of LoopEnum and
        blocking factor. The list is ordered from outermost to innermost, and
        excludes trivial loops.
        '''
        # Blocking level at index GBUF + 1.
        level = lbs.BL.GBUF + 1
        facs_x = lbs.bl_ts[level]
        ords_x = lbs.bl_ords[level]
        # BS level.
        facs_bs = lbs.bufshr_bs_t
        ords_bs = lbs.bufshr_bs_ord

        # Each BS factor must divide the factor of the same loop.
        divisible = all(x % b == 0 for x, b in zip(facs_x, facs_bs))
        self.assertTrue(divisible)

        subgrp_size = lbs.bufshr_subgrp_size
        rot_unit_cnt = lbs.bufshr_rot_unit_cnt

        # Non-trivial loops at the BS level, sorted by descending order value.
        bs_loops = [(lpe, facs_bs[lpe])
                    for lpe in range(le.NUM) if facs_bs[lpe] > 1]
        bs_loops.sort(key=lambda pair: ords_bs[pair[0]], reverse=True)

        # Non-trivial remaining loops, i.e., the factors with the BS part
        # divided out, sorted by descending order value.
        rem_loops = [(lpe, facs_x[lpe] // facs_bs[lpe])
                     for lpe in range(le.NUM) if facs_x[lpe] > facs_bs[lpe]]
        rem_loops.sort(key=lambda pair: ords_x[pair[0]], reverse=True)

        return subgrp_size, rot_unit_cnt, bs_loops + rem_loops


    class _SimBuffer():
        ''' A data buffer model for simulation. '''

        def __init__(self, dce, buf_cnt_pr, unit_size, bypass=False):

            self.dce = dce
            self.bypass = bypass

            # Accesses to this level, in unit counts (* unit size).
            self.access = 0

            # The size of one unit.
            self.unit_size = unit_size

            if self.bypass:
                return

            # The buffered data range, in the form of the range index, of all
            # dimensions. E.g., (ri0, ri1).
            self.data = (float('nan'), float('nan'))

            # The count of buffered units, aka, range size, of all dimensions.
            # E.g., (c0, c1).
            self.buf_cnt_pr = buf_cnt_pr

            # Range index cache.
            self.ridx_pr_cache = {}

        def access_size(self):
            ''' Get access size. '''
            return self.access * self.unit_size

        def do_access(self, idx_pr, cnt_pr, read=1, write=0):
            '''
            Access the buffer by `read` and/or `write`, with the unit index
            `idx_pr` and count `cnt_pr`, of all dimensions.

            Return the count of the accessing data to the next level, of all
            dimensions.
            '''
            if self.bypass:
                # Bypass, relay to the next level.
                return cnt_pr

            # Range index.
            ridx_pr = self._range_idx_pr(idx_pr)

            # Access.
            self.access += util.prod(cnt_pr) * (read + write)

            if ridx_pr == self.data:
                # Hit.
                return (0, 0)

            # Miss.
            self.data = ridx_pr
            return self.buf_cnt_pr

        def _range_idx_pr(self, idx_pr):
            ''' Get the range index of all dimensions. '''
            ridx_pr = self.ridx_pr_cache.get(idx_pr, None)
            if ridx_pr is None:
                ridx_pr = tuple(idx // buf_cnt for idx, buf_cnt
                                in zip(idx_pr, self.buf_cnt_pr))
                self.ridx_pr_cache[idx_pr] = ridx_pr
            return ridx_pr

    class _SimBufferSharing(_SimBuffer):
        '''
        A data buffer model with buffer sharing.

        On top of the plain buffer accesses counted by `_SimBuffer`, this
        model splits each buffered range into subranges spread over
        `subgrp_size` local buffers, and counts the rotation steps, rotation
        accesses, and wide fetch accesses.
        '''

        def __init__(self, dce, buf_cnt_pr, unit_size,
                     subgrp_size, rot_unit_cnt, lp_t_list, dim_loops,
                     bypass=False):
            '''
            Initialize the base buffer and the buffer sharing state.

            `dce`, `buf_cnt_pr`, `unit_size` and `bypass` are forwarded to
            `_SimBuffer`. `subgrp_size` is used as the number of local
            buffers. `rot_unit_cnt` is the number of rotation units, which
            must evenly divide the number of subranges. `lp_t_list` and
            `dim_loops` are passed to `_init_sub_range()` to enumerate the
            subranges.

            If bypassed, only the counters are initialized.
            '''

            # pylint: disable=protected-access
            # Keep the super proxy, which is reused by `do_access()`.
            self.base = super(TestLoopBlockingFixture._SimBufferSharing, self)

            self.base.__init__(dce, buf_cnt_pr, unit_size, bypass=bypass)

            # Number of rotation steps, of each range.
            self.rot_step_cnt = {}
            # Rotation accesses, in unit counts (* unit size).
            self.rot_access = 0
            # Wide fetch accesses, in unit counts (* unit size).
            self.wf_access = 0

            # Rotation rounds per load of a range. If only rotate a single
            # round per data load, the rotation is unnecessary.
            self.rot_rnd_cnt_per_load = None

            if self.bypass:
                return

            # Subrange.
            # A list in the accessing order of subrange indexes, i.e., the
            # ranges of the next level; and the unit counts in one subrange.
            self.subrng_list, self.subrng_cnt_pr = \
                    self._init_sub_range(lp_t_list, dim_loops)
            # Subrange index to the position in the list.
            self.subrng_idx_dict = \
                    dict((sr, i) for i, sr in enumerate(self.subrng_list))
            # Number of subranges.
            self.subrng_num = len(self.subrng_list)

            # Local buffer.
            self.buf_num = subgrp_size
            # Number of subranges in each buffer.
            # This is a float and can be fractional.
            self.buf_subrng_num = 1. * self.subrng_num / self.buf_num

            # The location centroid of each subrange, i.e., buffer index
            # weighted by fraction.
            # Subranges are placed one after another, filling each buffer up
            # to `buf_subrng_num` before moving on to the next buffer.
            self.buf_subrng_centroid = []
            cur_buf_cap = self.buf_subrng_num
            cur_buf_idx = 0
            for _ in range(self.subrng_num):
                centroid = 0
                # Fraction of this subrange not yet placed.
                rem_frac = 1.
                while rem_frac > 0.:
                    if cur_buf_cap >= rem_frac:
                        # Fits in the current buffer.
                        centroid += cur_buf_idx * rem_frac
                        cur_buf_cap -= rem_frac
                        rem_frac = 0.
                        break
                    # Partially fits.
                    centroid += cur_buf_idx * cur_buf_cap
                    rem_frac -= cur_buf_cap
                    cur_buf_cap = self.buf_subrng_num
                    cur_buf_idx += 1
                self.buf_subrng_centroid.append(centroid)

            # Rotation unit.
            # Rotation step happens when moving to the new rotation unit.
            assert self.subrng_num % rot_unit_cnt == 0
            # Number of subranges in one rotation unit.
            self.rot_unit_size = self.subrng_num // rot_unit_cnt
            # Steps per rotation round.
            # Keep adding steps while the stepped-over rotation units plus
            # one buffer do not yet cover all subranges, and the next
            # rotation unit is not past the end.
            self.rot_steps_per_round = 1
            while (self.rot_steps_per_round * self.rot_unit_size
                   + self.buf_subrng_num < self.subrng_num
                   and (self.rot_steps_per_round + 1) * self.rot_unit_size
                   < self.subrng_num):
                self.rot_steps_per_round += 1

            # The rotation unit currently worked on.
            self.cur_rot_unit = 0
            # Rotation steps of the current load of the current range.
            self.cur_rot_step_cnt = 0

            # Last wide fetch subrange index.
            self.last_wf_subrng_idx = 0
            # Amount of sequential wide fetch, can be combined with rotation.
            self.seq_wf_acc = 0
            # Total saved (combined with rotation) wide fetch access.
            self.saved_wf_access = 0

            # Subrange index cache.
            self.sridx_pr_cache = {}

        def rotation_rounds(self):
            ''' Get number of rotation rounds. '''

            # Ensure all ranges have the same rotation steps.
            steps_list = tuple(self.rot_step_cnt.values())
            if not steps_list:
                return 0
            assert all(s == steps_list[0] for s in steps_list)
            steps = steps_list[0]
            if steps == 0:
                return 0

            assert steps % self.rot_steps_per_round == 0

            # A single round per load means the rotation is unnecessary, so
            # no rounds are reported.
            if self.rot_rnd_cnt_per_load == 1:
                return 0
            return steps // self.rot_steps_per_round

        def rotation_access_size(self):
            ''' Get total rotation access size. '''
            # A single round per load means the rotation is unnecessary, so
            # no rotation access is reported.
            if self.rot_rnd_cnt_per_load == 1:
                return 0
            return self.rot_access * self.unit_size

        def wide_fetch_access_size(self):
            ''' Get total wide fetch access size. '''
            # With a single round per load, no rotation is reported, so the
            # wide fetch accesses that were combined with rotation are added
            # back.
            if self.rot_rnd_cnt_per_load == 1:
                return (self.wf_access + self.saved_wf_access) * self.unit_size
            return self.wf_access * self.unit_size

        def do_access(self, idx_pr, cnt_pr, read=1, write=0):
            '''
            Access the buffer as `_SimBuffer.do_access()` does, and return
            its result.

            Unless bypassed, also update the rotation step counts, the
            rotation accesses, and the wide fetch accesses caused by this
            access.
            '''

            ret = self.base.do_access(idx_pr, cnt_pr, read=read, write=write)

            if self.bypass:
                # Bypass, skip buffer sharing.
                return ret

            # Range index.
            ridx_pr = self._range_idx_pr(idx_pr)

            if any(ret):
                # Miss in the shared buffer and load new range. Reset.
                self.cur_rot_unit = 0
                self.rot_step_cnt.setdefault(ridx_pr, 0)

                if self.cur_rot_step_cnt == 0:
                    # Initial fetch, no replaced data yet.
                    assert self.rot_rnd_cnt_per_load is None
                else:
                    # Rounds done during the load that is being replaced.
                    # Every load must have a whole and equal number of
                    # rounds.
                    rot_rnd_cnt_per_load, rem_ = divmod(
                        self.cur_rot_step_cnt, self.rot_steps_per_round)
                    assert rem_ == 0
                    assert self.rot_rnd_cnt_per_load is None \
                            or self.rot_rnd_cnt_per_load == rot_rnd_cnt_per_load
                    self.rot_rnd_cnt_per_load = rot_rnd_cnt_per_load
                self.cur_rot_step_cnt = 0

            # One access must stay within a single subrange.
            assert all(cnt <= subrng_cnt for cnt, subrng_cnt
                       in zip(cnt_pr, self.subrng_cnt_pr))

            # Subrange index.
            sridx_pr = self._subrange_idx_pr(idx_pr)

            # Rotation unit index.
            ru_idx = self._subrng_rot_unit_idx(sridx_pr)

            if ru_idx != self.cur_rot_unit:
                # Move to next rotation unit.

                if (self.cur_rot_unit + 1) * self.rot_unit_size \
                        >= self.subrng_num:
                    # The current rotation unit is the last one. Start a new
                    # rotation round.
                    # Do not rotate back to the initial state. Instead start
                    # from the current state.
                    self.cur_rot_unit = 0

                    self.last_wf_subrng_idx = 0
                    self.seq_wf_acc = 0

                elif self.cur_rot_unit * self.rot_unit_size \
                        + self.buf_subrng_num >= self.subrng_num:
                    # The last rotation unit is already local. No more rotation.
                    self.cur_rot_unit += 1

                else:
                    # Rotate by one rotation unit, but not exceeding the end.
                    offset = min(self.rot_unit_size,
                                 self.subrng_num
                                 - self.cur_rot_unit * self.rot_unit_size
                                 - self.buf_subrng_num)
                    assert offset > 0

                    # All subranges shift by the above offset.
                    acc_ = (1. * offset / self.buf_subrng_num) * self.subrng_num
                    self.rot_access += util.prod(self.subrng_cnt_pr) * acc_
                    self.cur_rot_unit += 1

                    # One rotation step.
                    self.rot_step_cnt[ridx_pr] += 1
                    self.cur_rot_step_cnt += 1

                    # Combine wide fetch with rotation.
                    self.wf_access -= self.seq_wf_acc
                    self.saved_wf_access += self.seq_wf_acc
                    self.seq_wf_acc = 0

                # The accessed subrange must be in the rotation unit that was
                # just moved to.
                assert ru_idx == self.cur_rot_unit

            # Buffer index of which has this subrange.
            # This is a centroid, so it can be fractional.
            buf_idx = self._subrng_buf_idx(sridx_pr)

            # Wide fetch from possibly remote buffer.
            # The access is weighted by the buffer index.
            wf_acc = util.prod(cnt_pr) * (read + write) * buf_idx
            self.wf_access += wf_acc

            # Record amount of sequential wide fetch.
            # Moving back to an earlier subrange restarts the sequence.
            subrng_idx = self.subrng_idx_dict[sridx_pr]
            if subrng_idx >= self.last_wf_subrng_idx:
                self.seq_wf_acc += wf_acc
            else:
                self.seq_wf_acc = wf_acc
            self.last_wf_subrng_idx = subrng_idx

            return ret

        def _subrange_idx_pr(self, idx_pr):
            ''' Get the subrange index of all dimensions. '''
            sridx_pr = self.sridx_pr_cache.get(idx_pr, None)
            if sridx_pr is None:
                # Offset within the buffered range, in units of subranges.
                sridx_pr = tuple((idx % buf_cnt) // subrng_cnt
                                 for idx, buf_cnt, subrng_cnt
                                 in zip(idx_pr, self.buf_cnt_pr,
                                        self.subrng_cnt_pr))
                self.sridx_pr_cache[idx_pr] = sridx_pr
            return sridx_pr

        def _subrng_rot_unit_idx(self, sridx_pr):
            ''' Get the rotation unit index of the subrange. '''
            return self.subrng_idx_dict[sridx_pr] // self.rot_unit_size

        def _subrng_buf_idx(self, sridx_pr):
            ''' Get the buffer index of which currently has the subrange. '''
            subrng_idx = self.subrng_idx_dict[sridx_pr]

            # Start from the current rotation unit.
            # Positions wrap around the end of the subrange list.
            subrng_idx -= self.cur_rot_unit * self.rot_unit_size
            subrng_idx %= self.subrng_num

            return self.buf_subrng_centroid[subrng_idx]

        def _init_sub_range(self, lp_t_list, dim_loops):
            '''
            Enumerate the subranges of one buffered range.

            `lp_t_list` is a list of (loop, factor) tuples, which is walked
            from its last entry to its first. Loops that are not in
            `dim_loops` are skipped. `dim_loops` must have two entries, one
            for each data dimension.

            Return the list of subrange indexes in the generated order, and
            the unit counts of one subrange, of all dimensions.
            '''

            assert len(dim_loops) == 2

            # Start with a single subrange covering the whole range.
            subrng_list = [(0, 0)]
            subrng_sz_pr = [1, 1]

            # From inner to outer.
            for lpe, t in reversed(lp_t_list):
                # The data dimension index of this loop.
                try:
                    d = dim_loops.index(lpe)
                except ValueError:
                    # This loop is not related to the data, skip.
                    assert lpe not in dim_loops
                    continue

                # Size of this dimension of current loop body, i.e., all inner
                # loops.
                s = subrng_sz_pr[d]

                # Make the new subrange list, by looping over the current loop
                # body with the current loop factor, and updating this
                # dimension.
                new_subrng_list = []
                for i in range(t):
                    new_subrng_list += [tuple(i_ + i * s if d_ == d else i_
                                              for d_, i_ in enumerate(sr))
                                        for sr in subrng_list]
                subrng_list = new_subrng_list

                # Update size of this dimension.
                subrng_sz_pr[d] *= t

                # Check.
                assert len(set(subrng_list)) == len(subrng_list)
                assert len(subrng_list) == util.prod(subrng_sz_pr)

            # Unit counts of one subrange, i.e., the buffered unit counts
            # divided by the number of subranges along each dimension.
            subrng_cnt_pr = tuple(buf_cnt // subrng_sz for buf_cnt, subrng_sz
                                  in zip(self.buf_cnt_pr, subrng_sz_pr))

            return subrng_list, subrng_cnt_pr

    def _sim_access_conv(self, lbs, get_bufshr=False):
        '''
        Get data access by actually simulating and generating loops for CONV
        layer.

        Every index tuple generated by `lbs.gen_index()` is replayed, for each
        data category, through a chain of simulated buffers: REGF first, then
        GBUF, then DRAM. Each level receives the access counts returned by the
        level below it; the chain stops as soon as a level returns all-zero
        counts.

        Return `(dram_access, gbuf_access)`, two lists indexed by data
        category. GBUF accesses are multiplied by `lbs.num_nodes`; DRAM
        accesses are additionally divided by `lbs.accfwd_reduction`.

        If `get_bufshr` is True, also return bufshr stats, i.e., return
        `(dram_access, gbuf_access, (rotation_access, wide_fetch_access,
        rotation_rounds))`. The rotation and wide fetch accesses are scaled by
        the number of subgroups (`lbs.num_nodes // subgrp_size`); the rotation
        rounds are returned as simulated.

        If `get_bufshr` is False, assert that the simulated GBUF reports no
        rotation access, wide fetch access, or rotation rounds.
        '''
        self.assertTrue(lbs.is_valid(), '_sim_access_conv: invalid lbs.')

        data_loops = lbs.nld.data_loops

        # Blocking factors regrouped per loop, i.e., lpts[lpe][level].
        lpts = tuple(zip(*lbs.bl_ts))

        subgrp_size, rot_unit_cnt, lp_t_list = self._bufshr_params(lbs)

        def buf_cnt_prs(from_level):
            '''
            Buffered unit counts of all data categories, as the products of
            the blocking factors from level index `from_level` onward.
            '''
            return [tuple(util.prod(lpts[lpe][from_level:])
                          for lpe in data_loops[dce].loops())
                    for dce in range(de.NUM)]

        # Get buffered unit counts at each level.
        dram_buf_cnt_pr_list = buf_cnt_prs(0)
        gbuf_buf_cnt_pr_list = buf_cnt_prs(1)
        regf_buf_cnt_pr_list = buf_cnt_prs(2)

        # Initialize SimBuffer.
        # DRAM uses the GBUF unit access for data not stored in GBUF.
        drams = [self._SimBuffer(dce, buf_cnt_pr,
                                 lbs.nld.unit_access[me.DRAM][dce]
                                 if lbs.stored_in_gbuf[dce]
                                 else lbs.nld.unit_access[me.GBUF][dce],
                                )
                 for dce, buf_cnt_pr in enumerate(dram_buf_cnt_pr_list)]
        gbufs = [self._SimBufferSharing(
            dce, buf_cnt_pr, lbs.nld.unit_access[me.GBUF][dce],
            subgrp_size[dce], rot_unit_cnt[dce], lp_t_list,
            data_loops[dce].loops(),
            bypass=(not lbs.stored_in_gbuf[dce]))
                 for dce, buf_cnt_pr in enumerate(gbuf_buf_cnt_pr_list)]
        regfs = [self._SimBuffer(dce, buf_cnt_pr,
                                 lbs.nld.unit_access[me.REGF][dce],
                                )
                 for dce, buf_cnt_pr in enumerate(regf_buf_cnt_pr_list)]

        # Already generated psum for OFM.
        ofm_psum = set()

        # Simulation.
        for idx_tuple in lbs.gen_index():

            for dce in range(de.NUM):

                idx_pr = tuple(data_loops[dce].take(idx_tuple))

                if dce == de.OFM:
                    # Fetch and writeback, unless for the first time (no fetch).
                    write = 1
                    read = 1 if idx_pr in ofm_psum else 0
                    ofm_psum.add(idx_pr)
                else:
                    read = 1
                    write = 0

                # PE.
                cnt_pr = (1, 1)

                # REGF, GBUF, DRAM, from the innermost level outward. Stop
                # once a level has nothing to forward to the next one.
                for buf in (regfs[dce], gbufs[dce], drams[dce]):
                    cnt_pr = buf.do_access(idx_pr, cnt_pr, read, write)
                    if not any(cnt_pr):
                        break

        dram_access = [drams[dce].access_size() for dce in range(de.NUM)]
        gbuf_access = [gbufs[dce].access_size() for dce in range(de.NUM)]

        # Sum over all nodes.
        dram_access = [a * lbs.num_nodes // r for a, r
                       in zip(dram_access, lbs.accfwd_reduction)]
        gbuf_access = [a * lbs.num_nodes for a in gbuf_access]

        # Buffer sharing.
        if get_bufshr:
            rotation_access = [gbufs[dce].rotation_access_size()
                               * (lbs.num_nodes // subgrp_size[dce])
                               for dce in range(de.NUM)]
            wide_fetch_access = [gbufs[dce].wide_fetch_access_size()
                                 * (lbs.num_nodes // subgrp_size[dce])
                                 for dce in range(de.NUM)]
            rotation_rounds = [gbufs[dce].rotation_rounds()
                               for dce in range(de.NUM)]

            return dram_access, gbuf_access, \
                    (rotation_access, wide_fetch_access, rotation_rounds)

        # Without buffer sharing requested, there must be no bufshr activity.
        for dce in range(de.NUM):
            self.assertAlmostEqual(gbufs[dce].rotation_access_size(), 0,
                                   msg='_sim_access_conv: non-0 '
                                       'rotation access with no bufshr.')
            self.assertAlmostEqual(gbufs[dce].wide_fetch_access_size(), 0,
                                   msg='_sim_access_conv: non-0 '
                                       'wide fetch access with no bufshr.')
            self.assertEqual(gbufs[dce].rotation_rounds(), 0,
                             msg='_sim_access_conv: non-0 '
                                 'rotation rounds with no bufshr.')

        return dram_access, gbuf_access

    def _average_neighbor_nhops(self, bufshr, subgrp_size):
        '''
        Get the average neighbor number of hops.

        Return a list indexed by data category. Each entry is the sum of two
        hop counts obtained from `bufshr`, each divided by the subgroup size
        of that data category: one for the neighbor links inside the subgroup,
        and one for the coordinate of the last node of the subgroup
        (presumably the loop-back from the last node to the first).
        '''

        nhops_list = []

        for dce in range(de.NUM):
            # pylint: disable=protected-access

            size = subgrp_size[dce]

            subgrp_dim, idx_pr = bufshr._subgrp_dim(dce, size)
            nbr_dist = bufshr.nbr_dists[dce]

            # Dimension sizes along the `idx_pr` dimension and the other one.
            d_pr = subgrp_dim[idx_pr]
            d_npr = subgrp_dim[1 - idx_pr]

            # Neighbor counts along each dimension, put back into the
            # physical dimension order by sorting on the dimension index.
            by_dim = sorted(((idx_pr, (d_pr - 1) * d_npr),
                             (1 - idx_pr, d_npr - 1)))
            nbr_cnts = PhyDim2(*(cnt for _, cnt in by_dim))
            nhops_nbr = bufshr._nhops_with_neighbor_dist(dce, nbr_cnts)
            nhops_nbr = nhops_nbr / (1. * size)

            # Hops for the coordinate of the last node in the subgroup.
            last_coord = bufshr._coordinate(size - 1, subgrp_dim, idx_pr)
            nhops_lpbk = bufshr._nhops_with_neighbor_dist(dce, last_coord)
            nhops_lpbk = nhops_lpbk / (1. * size)

            nhops = nhops_nbr + nhops_lpbk

            if size <= 1:
                self.assertAlmostEqual(nhops, 0)
            elif subgrp_dim.size() == size:
                # Sanity bounds when the subgroup exactly fills its dimensions.
                lower = min(nbr_dist)
                upper = max(nbr_dist) \
                        + 1. * sum(subgrp_dim) / subgrp_dim.size()
                message = ('_average_neighbor_nhops: {}: '
                           'subgrp_size {}, subgrp_dim {}, idx_pr {}, '
                           'nbr_dist {}, nhops {} = {} + {}'
                           .format(dce, size, subgrp_dim,
                                   idx_pr, nbr_dist,
                                   nhops, nhops_nbr, nhops_lpbk))
                self.assertTrue(lower <= nhops <= upper, message)

            assert not (math.isnan(nhops) or math.isinf(nhops))
            nhops_list.append(nhops)

        return nhops_list

    def _verify_bufshr_stats(self, dram_access, gbuf_access, bufshr_stats,
                             lbs, bufshr, test_name):
        '''
        Verify the buffer sharing stats returned by access simulation.

        `dram_access` and `gbuf_access` are the simulated access lists, which
        must equal the DRAM and GBUF entries of `lbs.get_access()`.

        `bufshr_stats` is the tuple `(rotation_access, wide_fetch_access,
        rotation_rounds)`, each indexed by data category. The rotation and wide
        fetch accesses are multiplied by the average neighbor number of hops
        before being compared against the model values in `lbs`.

        `bufshr` is the buffer sharing scheme used to get the average neighbor
        number of hops.

        `test_name` labels all failure messages.
        '''

        rotation_access, wide_fetch_access, rotation_rounds = bufshr_stats

        avg_nbr_nhops = self._average_neighbor_nhops(bufshr,
                                                     lbs.bufshr_subgrp_size)

        # Mem hierarchy.
        access = lbs.get_access()

        self.assertListEqual(access[me.DRAM], dram_access,
                             '{}: DRAM: '
                             'model {} vs. sim {}.'
                             .format(test_name, access[me.DRAM], dram_access))
        self.assertListEqual(access[me.GBUF], gbuf_access,
                             '{}: GBUF: '
                             'model {} vs. sim {}.'
                             .format(test_name, access[me.GBUF], gbuf_access))
        # REGF is checked against the ops count: one access each for the
        # first two data categories and two for the last one.
        regf_access = [lbs.ops, lbs.ops, lbs.ops * 2]
        self.assertListEqual(access[me.REGF], regf_access,
                             '{}: REGF: '
                             'model {} vs. expected {}.'
                             .format(test_name, access[me.REGF], regf_access))

        # NoC.
        noc_access = lbs.get_noc_access()

        # NoC access is made of rotation and wide fetch accesses.
        for dce in range(de.NUM):
            self.assertAlmostEqual(lbs.bufshr_rotation_access[dce]
                                   + lbs.bufshr_wide_fetch_access[dce],
                                   noc_access[dce],
                                   msg=('{}: mismatch NoC access '
                                        'at {}:\nrotation: {}; '
                                        'wide fetch: {}; total: {}.'
                                        .format(test_name, dce,
                                                lbs.bufshr_rotation_access,
                                                lbs.bufshr_wide_fetch_access,
                                                noc_access)))

        # No NoC access without buffer sharing.
        for dce in range(de.NUM):
            if lbs.bufshr_subgrp_size[dce] <= 1:
                self.assertAlmostEqual(noc_access[dce], 0,
                                       msg=('{}: non-0 NoC access '
                                            'at {} with no bufshr: {}.'
                                            .format(test_name, dce,
                                                    noc_access)))

        for dce in range(de.NUM):
            self.assertAlmostEqual(lbs.bufshr_rot_round_cnt[dce],
                                   rotation_rounds[dce],
                                   msg=('{}: mismatch rotation round count '
                                        'at {}:\nmodel: {}; sim: {}.'
                                        .format(test_name, dce,
                                                lbs.bufshr_rot_round_cnt,
                                                rotation_rounds)))

        for dce in range(de.NUM):
            self.assertAlmostEqual(lbs.bufshr_rotation_access[dce],
                                   rotation_access[dce] * avg_nbr_nhops[dce],
                                   msg=('{}: mismatch NoC rotation access '
                                        'at {}:\nmodel: {}; sim: {} x {}.'
                                        .format(test_name, dce,
                                                lbs.bufshr_rotation_access,
                                                rotation_access,
                                                avg_nbr_nhops)))

        for dce in range(de.NUM):
            self.assertAlmostEqual(lbs.bufshr_wide_fetch_access[dce],
                                   wide_fetch_access[dce] * avg_nbr_nhops[dce],
                                   msg=('{}: mismatch NoC wide fetch access '
                                        'at {}:\nmodel: {}; sim: {} x {}.'
                                        .format(test_name, dce,
                                                lbs.bufshr_wide_fetch_access,
                                                wide_fetch_access,
                                                avg_nbr_nhops)))


    def _regularized_scheme(self, bl_ts, bl_ords):
        '''
        Get the regularized scheme which will not be skipped.

        `bl_ts` is a tuple of per-level blocking factor tuples, and `bl_ords`
        is a tuple of per-level loop order tuples. `bl_ords` may be shorter
        than `bl_ts`, in which case the remaining levels have no order.

        For each level with an order, the innermost non-trivial loop gets
        order 0, followed by the other non-trivial loops and then the trivial
        loops. If the innermost non-trivial loop of the previous (outer) level
        is also non-trivial at this level but not the innermost one here, its
        blocking factor of this level is merged into the outer level.

        The procedure repeats until the scheme no longer changes. Return the
        regularized `(bl_ts, bl_ords)`.
        '''

        assert isinstance(bl_ts, tuple) and isinstance(bl_ords, tuple)
        assert all(isinstance(t, tuple) for t in bl_ts)
        assert all(isinstance(o, tuple) for o in bl_ords)

        all_lpes = range(le.NUM)

        # Blocking factors collected per loop, one entry per level.
        lpts_acc = [[] for _ in all_lpes]
        # Regularized loop orders, one entry per level that has an order.
        ords_acc = []

        # Innermost non-trivial loop of the closest outer level that has any
        # non-trivial loops.
        prev_innermost = None

        # Missing orders are filled with None.
        for lvl_ts, lvl_ord in itertools.zip_longest(bl_ts, bl_ords):

            # Split the loops of this level into non-trivial and trivial ones,
            # both in ascending loop index order.
            nontrivial = [lpe for lpe in all_lpes if lvl_ts[lpe] > 1]
            trivial = [lpe for lpe in all_lpes if lpe not in nontrivial]

            # Innermost non-trivial loop, i.e., the one with the minimum order.
            try:
                innermost = min(nontrivial,
                                key=lambda lpe, o=lvl_ord: o[lpe])
            except (ValueError, TypeError):
                # Either no non-trivial loops, or no order (last level).
                assert not nontrivial or not lvl_ord
                innermost = None

            if lvl_ord:
                new_ord = [None] * le.NUM

                # The innermost loop, if any, takes order 0.
                if innermost is None:
                    first = 0
                else:
                    new_ord[innermost] = 0
                    first = 1

                # Then the remaining non-trivial loops (inner), and finally
                # the trivial loops (outer).
                others = [lpe for lpe in nontrivial + trivial
                          if lpe != innermost]
                for pos, lpe in enumerate(others, first):
                    new_ord[lpe] = pos
                assert first + len(others) == le.NUM

                ords_acc.append(tuple(new_ord))

            # Record the blocking factors of this level.
            for lpe, factors in enumerate(lpts_acc):
                factors.append(lvl_ts[lpe])

            if nontrivial:
                if prev_innermost != innermost \
                        and prev_innermost in nontrivial:
                    # Merge the factor of this level into the outer level,
                    # which makes the loop trivial at this level.
                    merged = lpts_acc[prev_innermost]
                    merged[-2] *= merged[-1]
                    merged[-1] = 1

                prev_innermost = innermost

        reg_ts = tuple(zip(*lpts_acc))
        reg_ords = tuple(ords_acc)

        if reg_ts != bl_ts or reg_ords != bl_ords:
            # Something changed. Regularize again, since loop
            # merging/reordering may cause further loop merging/reordering.
            return self._regularized_scheme(reg_ts, reg_ords)

        return reg_ts, reg_ords