""" VGSL plumbing """ from future.utils import PY2 import io import re import sys import json import gzip import torch import logging from torch import nn from typing import Sequence, List, Tuple, Union, Optional, Iterable, Callable, Dict, Any import kraken.lib.lstm from kraken.lib import layers from kraken.lib import clstm_pb2 from kraken.lib import pyrnn_pb2 from kraken.lib.codec import PytorchCodec from kraken.lib.exceptions import KrakenInvalidModelException from coremltools.models import MLModel from coremltools.models import datatypes from coremltools.models.neural_network import NeuralNetworkBuilder from google.protobuf.message import DecodeError # all tensors are ordered NCHW, the "feature" dimension is C, so the output of # an LSTM will be put into C same as the filters of a CNN. __all__ = ['TorchVGSLModel'] logger = logging.getLogger(__name__) class TorchVGSLModel(object): """ Class building a torch module from a VSGL spec. The initialized class will contain a variable number of layers and a loss function. Inputs and outputs are always 4D tensors in order (batch, channels, height, width) with channels always being the feature dimension. Importantly this means that a recurrent network will be fed the channel vector at each step along its time axis, i.e. either put the non-time-axis dimension into the channels dimension or use a summarizing RNN squashing the time axis to 1 and putting the output into the channels dimension respectively. Attributes: input (tuple): Expected input tensor as a 4-tuple. nn (torch.nn.Sequential): Stack of layers parsed from the spec. criterion (torch.nn.Module): Fully parametrized loss function. user_metdata (dict): dict with user defined metadata. Is flushed into model file during saving/overwritten by loading operations. one_channel_mode (str): Field indicating the image type used during training of one-channel images. Is '1' for models trained on binarized images, 'L' for grayscale, and None otherwise. """ def __init__(self, spec: str) -> None: """ Constructs a torch module from a (subset of) VSGL spec. Args: spec (str): Model definition similar to tesseract as follows: ============ FUNCTIONAL OPS ============ C(s|t|r|l|m)[{name}]<y>,<x>,<d>[,<y_stride>,<x_stride>] Convolves using a y,x window, with no shrinkage, SAME infill, d outputs, with s|t|r|l|m non-linear layer. (s|t|r|l|m) specifies the type of non-linearity: s = sigmoid t = tanh r = relu l = linear (i.e., None) m = softmax L(f|r|b)(x|y)[s][{name}]<n> LSTM cell with n outputs. f runs the LSTM forward only. r runs the LSTM reversed only. b runs the LSTM bidirectionally. x runs the LSTM in the x-dimension (on data with or without the y-dimension). y runs the LSTM in the y-dimension (data must have a y dimension). s (optional) summarizes the output in the requested dimension, outputting only the final step, collapsing the dimension to a single element. Examples: Lfx128 runs a forward-only LSTM in the x-dimension with 128 outputs, treating any y dimension independently. Lfys64 runs a forward-only LSTM in the y-dimension with 64 outputs and collapses the y-dimension to 1 element. G(f|r|b)(x|y)[s][{name}]<n> GRU cell with n outputs. Arguments are equivalent to LSTM specs. Do[{name}] Insert a 1D dropout layer with 0.5 drop probability. ============ PLUMBING OPS ============ [...] Execute ... networks in series (layers). Mp[{name}]<y>,<x>[<y_stride>,<x_stride>] Maxpool the input, reducing the (y,x) rectangle to a single vector value. S[{name}]<d>(<a>x<b>)<e>,<f> Splits one dimension, moves one part to another dimension. """ self.spec = spec self.named_spec = [] # type: List[str] self.ops = [self.build_rnn, self.build_dropout, self.build_maxpool, self.build_conv, self.build_output, self.build_reshape, self.build_groupnorm] self.codec = None # type: Optional[PytorchCodec] self.criterion = None # type: Any self.nn = layers.MultiParamSequential() self.user_metadata = {'accuracy': [], 'seg_type': None, 'one_channel_mode': None, 'model_type': None, 'hyper_params': {}} # type: dict[str, str] self.idx = -1 spec = spec.strip() if spec[0] != '[' or spec[-1] != ']': raise ValueError('Non-sequential models not supported') spec = spec[1:-1] blocks = spec.split(' ') self.named_spec.append(blocks[0]) pattern = re.compile(r'(\d+),(\d+),(\d+),(\d+)') m = pattern.match(blocks.pop(0)) if not m: raise ValueError('Invalid input spec.') batch, height, width, channels = [int(x) for x in m.groups()] self.input = (batch, channels, height, width) self._parse(self.input, blocks) def _parse(self, input: Tuple[int, int, int, int], blocks: Sequence[str]) -> None: """ Parses VGSL spec and appends layers to self.nn """ logger.debug('layer\t\ttype\tparams') for block in blocks: oshape = None layer = None for op in self.ops: oshape, name, layer = op(input, block) if oshape: break if oshape: input = oshape self.named_spec.append(self.set_layer_name(block, name)) # type: ignore self.nn.add_module(name, layer) else: raise ValueError('{} invalid layer definition'.format(block)) self.output = oshape def append(self, idx: int, spec: str) -> None: """ Splits a model at layer `idx` and append layers `spec`. New layers are initialized using the init_weights method. Args: idx (int): Index of layer to append spec to starting with 1. To select the whole layer stack set idx to None. spec (str): VGSL spec without input block to append to model. """ self.nn = self.nn[:idx] self.idx = idx-1 spec = spec[1:-1] blocks = spec.split(' ') self.named_spec = self.named_spec[:idx+1] self._parse(self.nn[-1].output_shape, blocks) self.spec = '[' + ' '.join(self.named_spec) + ']' self.init_weights(slice(idx, -1)) def to(self, device: Union[str, torch.device]) -> None: self.nn = self.nn.to(device) if self.criterion: self.criterion = self.criterion.to(device) def eval(self) -> None: """ Sets the model to evaluation/inference mode, disabling dropout and gradient calculation. """ self.nn.eval() torch.set_grad_enabled(False) def train(self) -> None: """ Sets the model to training mode (enables dropout layers and disables softmax on CTC layers). """ self.nn.train() # set last layer back to eval mode if not CTC output layer # (log_softmax/softmax switch). if not self.criterion: self.nn[-1].eval() torch.set_grad_enabled(True) def set_num_threads(self, num: int) -> None: """ Sets number of OpenMP threads to use. """ torch.set_num_threads(num) @classmethod def load_pyrnn_model(cls, path: str): """ Loads an pyrnn model to VGSL. """ if not PY2: raise KrakenInvalidModelException('Loading pickle models is not supported on python 3') import cPickle def find_global(mname, cname): aliases = { 'lstm.lstm': kraken.lib.lstm, 'ocrolib.lstm': kraken.lib.lstm, 'ocrolib.lineest': kraken.lib.lineest, } if mname in aliases: return getattr(aliases[mname], cname) return getattr(sys.modules[mname], cname) of = io.open if path.endswith('.gz'): of = gzip.open with io.BufferedReader(of(path, 'rb')) as fp: unpickler = cPickle.Unpickler(fp) unpickler.find_global = find_global try: net = unpickler.load() except Exception as e: raise KrakenInvalidModelException(str(e)) if not isinstance(net, kraken.lib.lstm.SeqRecognizer): raise KrakenInvalidModelException('Pickle is %s instead of ' 'SeqRecognizer' % type(net).__name__) # extract codec codec = PytorchCodec({k: [v] for k, v in net.codec.char2code.items()}) input = net.Ni parallel, softmax = net.lstm.nets fwdnet, revnet = parallel.nets revnet = revnet.net hidden = fwdnet.WGI.shape[0] # extract weights weightnames = ('WGI', 'WGF', 'WCI', 'WGO', 'WIP', 'WFP', 'WOP') fwd_w = [] rev_w = [] for w in weightnames: fwd_w.append(torch.Tensor(getattr(fwdnet, w))) rev_w.append(torch.Tensor(getattr(revnet, w))) t = torch.cat(fwd_w[:4]) weight_ih_l0 = t[:, :input+1] weight_hh_l0 = t[:, input+1:] t = torch.cat(rev_w[:4]) weight_ih_l0_rev = t[:, :input+1] weight_hh_l0_rev = t[:, input+1:] weight_lin = torch.Tensor(softmax.W2) # build vgsl spec and set weights nn = cls('[1,1,0,{} Lbxo{} O1ca{}]'.format(input, hidden, len(net.codec.code2char))) nn.nn.L_0.layer.weight_ih_l0 = torch.nn.Parameter(weight_ih_l0) nn.nn.L_0.layer.weight_hh_l0 = torch.nn.Parameter(weight_hh_l0) nn.nn.L_0.layer.weight_ih_l0_reverse = torch.nn.Parameter(weight_ih_l0_rev) nn.nn.L_0.layer.weight_hh_l0_reverse = torch.nn.Parameter(weight_hh_l0_rev) nn.nn.L_0.layer.weight_ip_l0 = torch.nn.Parameter(fwd_w[4]) nn.nn.L_0.layer.weight_fp_l0 = torch.nn.Parameter(fwd_w[5]) nn.nn.L_0.layer.weight_op_l0 = torch.nn.Parameter(fwd_w[6]) nn.nn.L_0.layer.weight_ip_l0_reverse = torch.nn.Parameter(rev_w[4]) nn.nn.L_0.layer.weight_fp_l0_reverse = torch.nn.Parameter(rev_w[5]) nn.nn.L_0.layer.weight_op_l0_reverse = torch.nn.Parameter(rev_w[6]) nn.nn.O_1.lin.weight = torch.nn.Parameter(weight_lin) nn.add_codec(codec) return nn @classmethod def load_pronn_model(cls, path: str): """ Loads an pronn model to VGSL. """ with open(path, 'rb') as fp: net = pyrnn_pb2.pyrnn() try: net.ParseFromString(fp.read()) except Exception: raise KrakenInvalidModelException('File does not contain valid proto msg') if not net.IsInitialized(): raise KrakenInvalidModelException('Model incomplete') # extract codec codec = PytorchCodec(net.codec) input = net.ninput hidden = net.fwdnet.wgi.dim[0] # extract weights weightnames = ('wgi', 'wgf', 'wci', 'wgo', 'wip', 'wfp', 'wop') fwd_w = [] rev_w = [] for w in weightnames: fwd_ar = getattr(net.fwdnet, w) rev_ar = getattr(net.revnet, w) fwd_w.append(torch.Tensor(fwd_ar.value).view(list(fwd_ar.dim))) rev_w.append(torch.Tensor(rev_ar.value).view(list(rev_ar.dim))) t = torch.cat(fwd_w[:4]) weight_ih_l0 = t[:, :input+1] weight_hh_l0 = t[:, input+1:] t = torch.cat(rev_w[:4]) weight_ih_l0_rev = t[:, :input+1] weight_hh_l0_rev = t[:, input+1:] weight_lin = torch.Tensor(net.softmax.w2.value).view(list(net.softmax.w2.dim)) # build vgsl spec and set weights nn = cls('[1,1,0,{} Lbxo{} O1ca{}]'.format(input, hidden, len(net.codec))) nn.nn.L_0.layer.weight_ih_l0 = torch.nn.Parameter(weight_ih_l0) nn.nn.L_0.layer.weight_hh_l0 = torch.nn.Parameter(weight_hh_l0) nn.nn.L_0.layer.weight_ih_l0_reverse = torch.nn.Parameter(weight_ih_l0_rev) nn.nn.L_0.layer.weight_hh_l0_reverse = torch.nn.Parameter(weight_hh_l0_rev) nn.nn.L_0.layer.weight_ip_l0 = torch.nn.Parameter(fwd_w[4]) nn.nn.L_0.layer.weight_fp_l0 = torch.nn.Parameter(fwd_w[5]) nn.nn.L_0.layer.weight_op_l0 = torch.nn.Parameter(fwd_w[6]) nn.nn.L_0.layer.weight_ip_l0_reverse = torch.nn.Parameter(rev_w[4]) nn.nn.L_0.layer.weight_fp_l0_reverse = torch.nn.Parameter(rev_w[5]) nn.nn.L_0.layer.weight_op_l0_reverse = torch.nn.Parameter(rev_w[6]) nn.nn.O_1.lin.weight = torch.nn.Parameter(weight_lin) nn.add_codec(codec) return nn @classmethod def load_clstm_model(cls, path: str): """ Loads an CLSTM model to VGSL. """ net = clstm_pb2.NetworkProto() with open(path, 'rb') as fp: try: net.ParseFromString(fp.read()) except Exception: raise KrakenInvalidModelException('File does not contain valid proto msg') if not net.IsInitialized(): raise KrakenInvalidModelException('Model incomplete') input = net.ninput attrib = {a.key: a.value for a in list(net.attribute)} # mainline clstm model if len(attrib) > 1: mode = 'clstm' else: mode = 'clstm_compat' # extract codec codec = PytorchCodec([''] + [chr(x) for x in net.codec[1:]]) # separate layers nets = {} nets['softm'] = [n for n in list(net.sub) if n.kind == 'SoftmaxLayer'][0] parallel = [n for n in list(net.sub) if n.kind == 'Parallel'][0] nets['lstm1'] = [n for n in list(parallel.sub) if n.kind.startswith('NPLSTM')][0] rev = [n for n in list(parallel.sub) if n.kind == 'Reversed'][0] nets['lstm2'] = rev.sub[0] hidden = int(nets['lstm1'].attribute[0].value) weights = {} # type: Dict[str, torch.Tensor] for n in nets: weights[n] = {} for w in list(nets[n].weights): weights[n][w.name] = torch.Tensor(w.value).view(list(w.dim)) if mode == 'clstm_compat': weightnames = ('.WGI', '.WGF', '.WCI', '.WGO') weightname_softm = '.W' else: weightnames = ('WGI', 'WGF', 'WCI', 'WGO') weightname_softm = 'W1' # input hidden and hidden-hidden weights are in one matrix. also # CLSTM/ocropy likes 1-augmenting every other tensor so the ih weights # are input+1 in one dimension. t = torch.cat(list(w for w in [weights['lstm1'][wn] for wn in weightnames])) weight_ih_l0 = t[:, :input+1] weight_hh_l0 = t[:, input+1:] t = torch.cat(list(w for w in [weights['lstm2'][wn] for wn in weightnames])) weight_ih_l0_rev = t[:, :input+1] weight_hh_l0_rev = t[:, input+1:] weight_lin = weights['softm'][weightname_softm] if mode == 'clstm_compat': weight_lin = torch.cat([torch.zeros(len(weight_lin), 1), weight_lin], 1) # build vgsl spec and set weights nn = cls('[1,1,0,{} Lbxc{} O1ca{}]'.format(input, hidden, len(net.codec))) nn.nn.L_0.layer.weight_ih_l0 = torch.nn.Parameter(weight_ih_l0) nn.nn.L_0.layer.weight_hh_l0 = torch.nn.Parameter(weight_hh_l0) nn.nn.L_0.layer.weight_ih_l0_reverse = torch.nn.Parameter(weight_ih_l0_rev) nn.nn.L_0.layer.weight_hh_l0_reverse = torch.nn.Parameter(weight_hh_l0_rev) nn.nn.O_1.lin.weight = torch.nn.Parameter(weight_lin) nn.add_codec(codec) return nn @classmethod def load_model(cls, path: str): """ Deserializes a VGSL model from a CoreML file. Args: path (str): CoreML file Returns: A TorchVGSLModel instance. Raises: KrakenInvalidModelException if the model data is invalid (not a string, protobuf file, or without appropriate metadata). FileNotFoundError if the path doesn't point to a file. """ try: mlmodel = MLModel(path) except TypeError as e: raise KrakenInvalidModelException(str(e)) except DecodeError as e: raise KrakenInvalidModelException('Failure parsing model protobuf: {}'.format(str(e))) if 'vgsl' not in mlmodel.user_defined_metadata: raise KrakenInvalidModelException('No VGSL spec in model metadata') vgsl_spec = mlmodel.user_defined_metadata['vgsl'] nn = cls(vgsl_spec) for name, layer in nn.nn.named_children(): layer.deserialize(name, mlmodel.get_spec()) if 'codec' in mlmodel.user_defined_metadata: nn.add_codec(PytorchCodec(json.loads(mlmodel.user_defined_metadata['codec']))) nn.user_metadata = {'accuracy': [], 'seg_type': 'bbox', 'one_channel_mode': '1', 'model_type': None, 'hyper_params': {}} # type: dict[str, str] if 'kraken_meta' in mlmodel.user_defined_metadata: nn.user_metadata.update(json.loads(mlmodel.user_defined_metadata['kraken_meta'])) return nn @property def one_channel_mode(self): return self.user_metadata['one_channel_mode'] @one_channel_mode.setter def one_channel_mode(self, val: str): if val not in ['1', 'L', None]: raise ValueError('one_channel_mode {} is not one of [1, L, None]'.format(val)) self.user_metadata['one_channel_mode'] = val @property def model_type(self): return self.user_metadata['model_type'] @model_type.setter def model_type(self, val: str): if val not in ['recognition', 'segmentation']: raise ValueError('model_type {} is not one of [recognition, segmentation]'.format(val)) self.user_metadata['model_type'] = val @property def seg_type(self): return self.user_metadata['seg_type'] @seg_type.setter def seg_type(self, val: str): if val not in ['bbox', 'baselines', None]: raise ValueError('segmentation type {} is not one of [bbox, baselines, None]'.format(val)) self.user_metadata['seg_type'] = val @property def hyper_params(self, **kwargs): return self.user_metadata['hyper_params'] @hyper_params.setter def hyper_params(self, val: Dict[str, Any]): self.user_metadata['hyper_params'].update(val) def save_model(self, path: str): """ Serializes the model into path. Args: path (str): Target destination """ inputs = [('input', datatypes.Array(*self.input))] outputs = [('output', datatypes.Array(*self.output))] net_builder = NeuralNetworkBuilder(inputs, outputs) input = 'input' prev_device = next(next(self.nn.children()).parameters()).device try: for name, layer in self.nn.to('cpu').named_children(): input = layer.serialize(name, input, net_builder) mlmodel = MLModel(net_builder.spec) mlmodel.short_description = 'kraken recognition model' mlmodel.user_defined_metadata['vgsl'] = '[' + ' '.join(self.named_spec) + ']' if self.codec: mlmodel.user_defined_metadata['codec'] = json.dumps(self.codec.c2l) if self.user_metadata: mlmodel.user_defined_metadata['kraken_meta'] = json.dumps(self.user_metadata) mlmodel.save(path) finally: self.nn.to(prev_device) def add_codec(self, codec: PytorchCodec) -> None: """ Adds a PytorchCodec to the model. """ self.codec = codec def init_weights(self, idx: slice = slice(0, None)) -> None: """ Initializes weights for all or a subset of layers in the graph. LSTM/GRU layers are orthogonally initialized, convolutional layers uniformly from (-0.1,0.1). Args: idx (slice): A slice object representing the indices of layers to initialize. """ def _wi(m): if isinstance(m, torch.nn.Linear): torch.nn.init.xavier_uniform_(m.weight.data) torch.nn.init.constant_(m.bias.data, 0) elif isinstance(m, torch.nn.LSTM): for p in m.parameters(): # weights if p.data.dim() == 2: torch.nn.init.orthogonal_(p.data) # initialize biases to 1 (jozefowicz 2015) else: torch.nn.init.constant_(p.data[len(p)//4:len(p)//2], 1.0) elif isinstance(m, torch.nn.GRU): for p in m.parameters(): torch.nn.init.orthogonal_(p.data) elif isinstance(m, torch.nn.Conv2d): for p in m.parameters(): torch.nn.init.uniform_(p.data, -0.1, 0.1) self.nn[idx].apply(_wi) @staticmethod def set_layer_name(layer: str, name: str) -> str: """ Sets the name field of an VGSL layer definition. Args: layer (str): VGSL definition name (str): Layer name """ if '{' in layer and '}' in layer: return layer lsplits = re.split(r'(^[^\d]+)', layer) lsplits.insert(-1, '{{{}}}'.format(name)) return ''.join(lsplits) def get_layer_name(self, layer: str, name: Optional[str] = None) -> str: """ Generates a unique identifier for the layer optionally using a supplied name. Args: layer (str): Identifier of the layer type name (str): user-supplied {name} with {} that need removing. Returns: (str) network unique layer name """ self.idx += 1 if name: return name[1:-1] else: return '{}_{}'.format(re.sub(r'\W+', '_', layer), self.idx) def resize_output(self, output_size: int, del_indices: Optional[Iterable] = None) -> None: """ Resizes an output linear projection layer. Args: output_size (int): New size of the linear layer del_indices (list): list of outputs to delete from layer """ if not isinstance(self.nn[-1], layers.LinSoftmax): raise ValueError('last layer is not linear projection') logger.debug('Resizing output LinSoftmax layer to {}'.format(output_size)) self.nn[-1].resize(output_size, del_indices) pattern = re.compile(r'(O)(?P<name>{\w+})?(?P<dim>2|1|0)(?P<type>l|s|c)(?P<aug>a)?(?P<out>\d+)') m = pattern.match(self.named_spec[-1]) if not m: raise ValueError('Output specification is not parsable') aug = m.group('aug') if m.group('aug') else '' self.named_spec[-1] = 'O{}{}{}{}{}'.format(m.group('name'), m.group('dim'), m.group('type'), aug, output_size) self.spec = '[' + ' '.join(self.named_spec) + ']' def build_rnn(self, input: Tuple[int, int, int, int], block: str) -> Union[Tuple[None, None, None], Tuple[Tuple[int, int, int, int], str, Callable]]: """ Builds an LSTM/GRU layer returning number of outputs and layer. """ pattern = re.compile(r'(?P<type>L|G)(?P<dir>f|r|b)(?P<dim>x|y)(?P<sum>s)?(?P<legacy>c|o)?(?P<name>{\w+})?(?P<out>\d+)') m = pattern.match(block) if not m: return None, None, None type = m.group('type') direction = m.group('dir') dim = m.group('dim') == 'y' summarize = m.group('sum') == 's' legacy = None if m.group('legacy') == 'c': legacy = 'clstm' elif m.group('legacy') == 'o': legacy = 'ocropy' hidden = int(m.group(7)) fn = layers.TransposedSummarizingRNN(input[1], hidden, direction, dim, summarize, legacy) logger.debug('{}\t\trnn\tdirection {} transposed {} summarize {} out {} legacy {}'.format(self.idx+1, direction, dim, summarize, hidden, legacy)) return fn.get_shape(input), self.get_layer_name(type, m.group('name')), fn def build_dropout(self, input: Tuple[int, int, int, int], block: str) -> Union[Tuple[None, None, None], Tuple[Tuple[int, int, int, int], str, Callable]]: pattern = re.compile(r'(?P<type>Do)(?P<name>{\w+})?(?P<p>(\d+(\.\d*)?|\.\d+))?(,(?P<dim>\d+))?') m = pattern.match(block) if not m: return None, None, None prob = float(m.group('p')) if m.group('p') else 0.5 dim = int(m.group('dim')) if m.group('dim') else 1 fn = layers.Dropout(prob, dim) logger.debug('{}\t\tdropout\tprobability {} dims {}'.format(self.idx+1, prob, dim)) return fn.get_shape(input), self.get_layer_name(m.group('type'), m.group('name')), fn def build_groupnorm(self, input: Tuple[int, int, int, int], block: str) -> Union[Tuple[None, None, None], Tuple[Tuple[int, int, int, int], str, Callable]]: pattern = re.compile(r'(?P<type>Gn)(?P<name>{\w+})?(?P<groups>\d+)') m = pattern.match(block) if not m: return None, None, None groups = int(m.group('groups')) fn = layers.GroupNorm(input[1], groups) logger.debug('{}\t\tgroupnorm\tgroups {}'.format(self.idx+1, groups)) return fn.get_shape(input), self.get_layer_name(m.group('type'), m.group('name')), fn def build_conv(self, input: Tuple[int, int, int, int], block: str) -> Union[Tuple[None, None, None], Tuple[Tuple[int, int, int, int], str, Callable]]: """ Builds a 2D convolution layer. """ pattern = re.compile(r'(?P<type>C)(?P<nl>s|t|r|l|m)(?P<name>{\w+})?(\d+),(\d+),(?P<out>\d+)(,(?P<stride_y>\d+),(?P<stride_x>\d+))?') m = pattern.match(block) if not m: return None, None, None kernel_size = (int(m.group(4)), int(m.group(5))) filters = int(m.group('out')) stride = (int(m.group('stride_y')), int(m.group('stride_x'))) if m.group('stride_x') else (1, 1) nl = m.group('nl') fn = layers.ActConv2D(input[1], filters, kernel_size, stride, nl) logger.debug('{}\t\tconv\tkernel {} x {} filters {} stride {} activation {}'.format(self.idx+1, kernel_size[0], kernel_size[1], filters, stride, nl)) return fn.get_shape(input), self.get_layer_name(m.group('type'), m.group('name')), fn def build_maxpool(self, input: Tuple[int, int, int, int], block: str) -> Union[Tuple[None, None, None], Tuple[Tuple[int, int, int, int], str, Callable]]: """ Builds a maxpool layer. """ pattern = re.compile(r'(?P<type>Mp)(?P<name>{\w+})?(\d+),(\d+)(?:,(\d+),(\d+))?') m = pattern.match(block) if not m: return None, None, None kernel_size = (int(m.group(3)), int(m.group(4))) stride = (kernel_size[0] if not m.group(5) else int(m.group(5)), kernel_size[1] if not m.group(6) else int(m.group(6))) fn = layers.MaxPool(kernel_size, stride) logger.debug('{}\t\tmaxpool\tkernel {} x {} stride {} x {}'.format(self.idx+1, kernel_size[0], kernel_size[1], stride[0], stride[1])) return fn.get_shape(input), self.get_layer_name(m.group('type'), m.group('name')), fn def build_reshape(self, input: Tuple[int, int, int, int], block: str) -> Union[Tuple[None, None, None], Tuple[Tuple[int, int, int, int], str, Callable]]: """ Builds a reshape layer """ pattern = re.compile(r'(?P<type>S)(?P<name>{\w+})?(?P<dim>\d+)\((?P<part_a>\d+)x(?P<part_b>\d+)\)(?P<high>\d+),(?P<low>\d+)') m = pattern.match(block) if not m: return None, None, None src_dim = int(m.group('dim')) part_a = int(m.group('part_a')) part_b = int(m.group('part_b')) high = int(m.group('high')) low = int(m.group('low')) dim_map = {0: 0, 1: 2, 2: 3, 3: 1} if part_a == 0: part_a = -1 if part_b == 0: part_b = -1 if src_dim != high and src_dim != low: raise ValueError('Either high ({}) or low ({}) must be source dimension ({})'.format(high, low, src_dim)) if part_a == 0 or part_b == 0: raise ValueError('Expected non-zero size for part_a ({}) or part_b ({})'.format(part_a, part_b)) if part_a == -1 and part_b == -1: raise ValueError('Only one size may be -1') logger.debug('{}\t\treshape from {} {} x {} to {}/{}'.format(self.idx+1, src_dim, part_a, part_b, high, low)) src_dim = dim_map[src_dim] high = dim_map[high] low = dim_map[low] fn = layers.Reshape(src_dim, part_a, part_b, high, low) return fn.get_shape(input), self.get_layer_name(m.group('type'), m.group('name')), fn def build_output(self, input: Tuple[int, int, int, int], block: str) -> Union[Tuple[None, None, None], Tuple[Tuple[int, int, int, int], str, Callable]]: """ Builds an output layer. """ pattern = re.compile(r'(O)(?P<name>{\w+})?(?P<dim>2|1|0)(?P<type>l|s|c)(?P<aug>a)?(?P<out>\d+)') m = pattern.match(block) if not m: return None, None, None dim = int(m.group('dim')) nl = m.group('type') outdim = int(m.group('out')) if dim == 0: raise ValueError('categorical output not supported, yet.') if nl == 'c' and dim == 2: raise ValueError('CTC not supported for heatmap output') if nl in ['l', 's'] and int(m.group('out')) >= 1: self.criterion = nn.BCELoss() elif nl == 'c': self.criterion = nn.CTCLoss(reduction='sum', zero_infinity=True) else: raise ValueError('unsupported output specification') # heatmap output if dim == 2: act = 's' if nl == 'l' else 'm' fn = layers.ActConv2D(input[1], outdim, (1, 1), (1, 1), act) logger.debug('{}\t\tconv\tkernel 1 x 1 filters {} stride 1 activation {}'.format(self.idx+1, outdim, nl)) return fn.get_shape(input), self.get_layer_name(m.group('type'), m.group('name')), fn else: aug = True if m.group('aug') else False lin = layers.LinSoftmax(input[1], int(m.group('out')), aug) logger.debug('{}\t\tlinear\taugmented {} out {}'.format(self.idx+1, aug, m.group('out'))) return lin.get_shape(input), self.get_layer_name(m.group(1), m.group('name')), lin