# Copyright (c) 2017-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the license found in the LICENSE file in # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. import math import torch import torch.nn as nn import torch.nn.functional as F from fairseq import options from fairseq import utils from fairseq.modules import ( AdaptiveInput, AdaptiveSoftmax, CharacterTokenEmbedder, LearnedPositionalEmbedding, MultiheadAttention, SinusoidalPositionalEmbedding, DynamicConv1dTBC, LightweightConv1dTBC ) from . import ( FairseqIncrementalDecoder, FairseqEncoder, FairseqLanguageModel, FairseqModel, register_model, register_model_architecture, ) @register_model('lightconv') class LightConvModel(FairseqModel): """ LightConv and DynamicConv model from `"Pay Less Attention with Lightweight and Dynamic Convolutions" (Wu, et al, 2019) <https://openreview.net/pdf?id=SkVhlh09tX>`_. To use LightConv please set --encoder-conv-type lightweight --decoder-conv-type lightweight To use DynamicConv please set --encoder-conv-type dynamic --decoder-conv-type dynamic Args: encoder (LightConvEncoder): the encoder decoder (LightConvDecoder): the decoder The LightConv model provides the following named architectures and command-line arguments: .. argparse:: :ref: fairseq.models.lightconv_parser :prog: """ def __init__(self, encoder, decoder): super().__init__(encoder, decoder) @staticmethod def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument('--relu-dropout', type=float, metavar='D', help='dropout probability after ReLU in FFN') parser.add_argument('--input-dropout', type=float, metavar='D', help='dropout probability of the inputs') parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-conv-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads or LightConv/DynamicConv heads') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument('--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-conv-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads or LightConv/DynamicConv heads') parser.add_argument('--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. ' 'Must be used with adaptive_loss criterion'), parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') """LightConv and DynamicConv arguments""" parser.add_argument('--encoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int), help='list of kernel size (default: "[3,7,15,31,31,31,31]")') parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int), help='list of kernel size (default: "[3,7,15,31,31,31]")') parser.add_argument('--encoder-glu', type=options.eval_bool, help='glu after in proj') parser.add_argument('--decoder-glu', type=options.eval_bool, help='glu after in proj') parser.add_argument('--encoder-conv-type', default='dynamic', type=str, choices=['dynamic', 'lightweight'], help='type of convolution') parser.add_argument('--decoder-conv-type', default='dynamic', type=str, choices=['dynamic', 'lightweight'], help='type of convolution') parser.add_argument('--weight-softmax', default=True, type=options.eval_bool) parser.add_argument('--weight-dropout', type=float, metavar='D', help='dropout probability for conv weights') @classmethod def build_model(cls, args, task): """Build a new model instance.""" # make sure all arguments are present in older models base_architecture(args) if not hasattr(args, 'max_source_positions'): args.max_source_positions = 1024 if not hasattr(args, 'max_target_positions'): args.max_target_positions = 1024 src_dict, tgt_dict = task.source_dictionary, task.target_dictionary def build_embedding(dictionary, embed_dim, path=None): num_embeddings = len(dictionary) padding_idx = dictionary.pad() emb = Embedding(num_embeddings, embed_dim, padding_idx) # if provided, load from preloaded dictionaries if path: embed_dict = utils.parse_embedding(path) utils.load_embedding(embed_dict, dictionary, emb) return emb if args.share_all_embeddings: if src_dict != tgt_dict: raise RuntimeError('--share-all-embeddings requires a joined dictionary') if args.encoder_embed_dim != args.decoder_embed_dim: raise RuntimeError( '--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim') if args.decoder_embed_path and ( args.decoder_embed_path != args.encoder_embed_path): raise RuntimeError('--share-all-embeddings not compatible with --decoder-embed-path') encoder_embed_tokens = build_embedding( src_dict, args.encoder_embed_dim, args.encoder_embed_path ) decoder_embed_tokens = encoder_embed_tokens args.share_decoder_input_output_embed = True else: encoder_embed_tokens = build_embedding( src_dict, args.encoder_embed_dim, args.encoder_embed_path ) decoder_embed_tokens = build_embedding( tgt_dict, args.decoder_embed_dim, args.decoder_embed_path ) encoder = LightConvEncoder(args, src_dict, encoder_embed_tokens) decoder = LightConvDecoder(args, tgt_dict, decoder_embed_tokens) return LightConvModel(encoder, decoder) @register_model('lightconv_lm') class LightConvLanguageModel(FairseqLanguageModel): def __init__(self, decoder): super().__init__(decoder) @staticmethod def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument('--dropout', default=0.1, type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', default=0., type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument('--relu-dropout', default=0., type=float, metavar='D', help='dropout probability after ReLU in FFN') parser.add_argument('--input-dropout', type=float, metavar='D', help='dropout probability of the inputs') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-output-dim', type=int, metavar='N', help='decoder output dimension') parser.add_argument('--decoder-input-dim', type=int, metavar='N', help='decoder input dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads or LightConv/DynamicConv heads') parser.add_argument('--decoder-normalize-before', default=False, action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. ' 'Must be used with adaptive_loss criterion') parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N', help='adaptive input factor') parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', help='if set, disables positional embeddings (outside self attention)') parser.add_argument('--share-decoder-input-output-embed', default=False, action='store_true', help='share decoder input and output embeddings') parser.add_argument('--character-embeddings', default=False, action='store_true', help='if set, uses character embedding convolutions to produce token embeddings') parser.add_argument('--character-filters', type=str, metavar='LIST', default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]', help='size of character embeddings') parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4, help='size of character embeddings') parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2, help='number of highway layers for character token embeddder') parser.add_argument('--adaptive-input', default=False, action='store_true', help='if set, uses adaptive input') parser.add_argument('--adaptive-input-factor', type=float, metavar='N', help='adaptive input factor') parser.add_argument('--adaptive-input-cutoff', metavar='EXPR', help='comma separated list of adaptive input cutoff points.') parser.add_argument('--tie-adaptive-weights', action='store_true', help='if set, ties the weights of adaptive softmax and adaptive input') parser.add_argument('--tie-adaptive-proj', action='store_true', help='if set, ties the projection weights of adaptive softmax and adaptive input') parser.add_argument('--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') """LightConv and DynamicConv arguments""" parser.add_argument('--decoder-kernel-size-list', type=lambda x: options.eval_str_list(x, int), help='list of kernel size (default: "[3,7,15,31,31,31]")') parser.add_argument('--decoder-glu', type=options.eval_bool, help='glu after in proj') parser.add_argument('--decoder-conv-type', default='dynamic', type=str, choices=['dynamic', 'lightweight'], help='type of convolution') parser.add_argument('--weight-softmax', default=True, type=options.eval_bool) parser.add_argument('--weight-dropout', type=float, metavar='D', help='dropout probability for conv weights') @classmethod def build_model(cls, args, task): """Build a new model instance.""" # make sure all arguments are present in older models base_lm_architecture(args) if not hasattr(args, 'max_source_positions'): args.max_source_positions = args.tokens_per_sample if not hasattr(args, 'max_target_positions'): args.max_target_positions = args.tokens_per_sample if args.character_embeddings: embed_tokens = CharacterTokenEmbedder(task.dictionary, eval(args.character_filters), args.character_embedding_dim, args.decoder_embed_dim, args.char_embedder_highway_layers, ) elif args.adaptive_input: embed_tokens = AdaptiveInput(len(task.dictionary), task.dictionary.pad(), args.decoder_input_dim, args.adaptive_input_factor, args.decoder_embed_dim, options.eval_str_list(args.adaptive_input_cutoff, type=int)) else: embed_tokens = Embedding(len(task.dictionary), args.decoder_input_dim, task.dictionary.pad()) if args.tie_adaptive_weights: assert args.adaptive_input assert args.adaptive_input_factor == args.adaptive_softmax_factor assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, '{} != {}'.format( args.adaptive_softmax_cutoff, args.adaptive_input_cutoff) assert args.decoder_input_dim == args.decoder_output_dim decoder = LightConvDecoder(args, task.output_dictionary, embed_tokens, no_encoder_attn=True, final_norm=False) return LightConvLanguageModel(decoder) class LightConvEncoder(FairseqEncoder): """ LightConv encoder consisting of *args.encoder_layers* layers. Each layer is a :class:`LightConvEncoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): encoding dictionary embed_tokens (torch.nn.Embedding): input embedding left_pad (bool, optional): whether the input is left-padded. Default: ``True`` """ def __init__(self, args, dictionary, embed_tokens, left_pad=True): super().__init__(dictionary) self.dropout = args.dropout embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) self.embed_positions = PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, left_pad=left_pad, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ LightConvEncoderLayer(args, kernel_size=args.encoder_kernel_size_list[i]) for i in range(args.encoder_layers) ]) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.encoder_normalize_before if self.normalize: self.layer_norm = LayerNorm(embed_dim) def forward(self, src_tokens, src_lengths): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (torch.LongTensor): lengths of each source sentence of shape `(batch)` Returns: dict: - **encoder_out** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)` - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` """ # embed tokens and positions x = self.embed_scale * self.embed_tokens(src_tokens) if self.embed_positions is not None: x += self.embed_positions(src_tokens) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) # compute padding mask encoder_padding_mask = src_tokens.eq(self.padding_idx) if not encoder_padding_mask.any(): encoder_padding_mask = None # encoder layers for layer in self.layers: x = layer(x, encoder_padding_mask) if self.normalize: x = self.layer_norm(x) return { 'encoder_out': x, # T x B x C 'encoder_padding_mask': encoder_padding_mask, # B x T } def reorder_encoder_out(self, encoder_out, new_order): """ Reorder encoder output according to *new_order*. Args: encoder_out: output from the ``forward()`` method new_order (LongTensor): desired order Returns: *encoder_out* rearranged according to *new_order* """ if encoder_out['encoder_out'] is not None: encoder_out['encoder_out'] = \ encoder_out['encoder_out'].index_select(1, new_order) if encoder_out['encoder_padding_mask'] is not None: encoder_out['encoder_padding_mask'] = \ encoder_out['encoder_padding_mask'].index_select(0, new_order) return encoder_out def max_positions(self): """Maximum input length supported by the encoder.""" if self.embed_positions is None: return self.max_source_positions return min(self.max_source_positions, self.embed_positions.max_positions()) class LightConvDecoder(FairseqIncrementalDecoder): """ LightConv decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`LightConvDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs. Default: ``False`` left_pad (bool, optional): whether the input is left-padded. Default: ``False`` """ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, left_pad=left_pad, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ LightConvDecoderLayer(args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i]) for i in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \ if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim) def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for input feeding/teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the last decoder layer's output of shape `(batch, tgt_len, vocab)` - the last decoder layer's attention weights of shape `(batch, tgt_len, src_len)` """ # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers for layer in self.layers: x, attn = layer( x, encoder_out['encoder_out'] if encoder_out is not None else None, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, ) inner_states.append(x) if self.normalize: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: x = F.linear(x, self.embed_tokens.weight) else: x = F.linear(x, self.embed_out) return x, {'attn': attn, 'inner_states': inner_states} def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device: self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) if self._future_mask.size(0) < dim: self._future_mask = torch.triu(utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1) return self._future_mask[:dim, :dim] class LightConvEncoderLayer(nn.Module): """Encoder layer block. Args: args (argparse.Namespace): parsed command-line arguments kernel_size: kernel size of the convolution """ def __init__(self, args, kernel_size=0): super().__init__() self.embed_dim = args.encoder_embed_dim self.conv_dim = args.encoder_conv_dim padding_l = kernel_size // 2 if kernel_size % 2 == 1 else ((kernel_size - 1) // 2, kernel_size // 2) if args.encoder_glu: self.linear1 = Linear(self.embed_dim, 2*self.conv_dim) self.act = nn.GLU() else: self.linear1 = Linear(self.embed_dim, self.conv_dim) self.act = None if args.encoder_conv_type == 'lightweight': self.conv = LightweightConv1dTBC(self.conv_dim, kernel_size, padding_l=padding_l, weight_softmax=args.weight_softmax, num_heads=args.encoder_attention_heads, weight_dropout=args.weight_dropout) elif args.encoder_conv_type == 'dynamic': self.conv = DynamicConv1dTBC(self.conv_dim, kernel_size, padding_l=padding_l, weight_softmax=args.weight_softmax, num_heads=args.encoder_attention_heads, weight_dropout=args.weight_dropout) else: raise NotImplementedError self.linear2 = Linear(self.conv_dim, self.embed_dim) self.dropout = args.dropout self.relu_dropout = args.relu_dropout self.input_dropout = args.input_dropout self.normalize_before = args.encoder_normalize_before self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim) self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim) self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)]) def forward(self, x, encoder_padding_mask): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor): binary ByteTensor of shape `(batch, src_len)` where padding elements are indicated by ``1``. Returns: encoded output of shape `(batch, src_len, embed_dim)` """ residual = x x = self.maybe_layer_norm(0, x, before=True) x = F.dropout(x, p=self.input_dropout, training=self.training) x = self.linear1(x) if self.act is not None: x = self.act(x) if encoder_padding_mask is not None: x = x.masked_fill(encoder_padding_mask.transpose(0, 1).unsqueeze(2), 0) x = self.conv(x) x = self.linear2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(0, x, after=True) residual = x x = self.maybe_layer_norm(1, x, before=True) x = F.relu(self.fc1(x)) x = F.dropout(x, p=self.relu_dropout, training=self.training) x = self.fc2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(1, x, after=True) return x def maybe_layer_norm(self, i, x, before=False, after=False): assert before ^ after if after ^ self.normalize_before: return self.layer_norms[i](x) else: return x def extra_repr(self): return 'dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}'.format( self.dropout, self.relu_dropout, self.input_dropout, self.normalize_before) class LightConvDecoderLayer(nn.Module): """Decoder layer block. Args: args (argparse.Namespace): parsed command-line arguments no_encoder_attn (bool, optional): whether to attend to encoder outputs. Default: ``False`` kernel_size: kernel size of the convolution """ def __init__(self, args, no_encoder_attn=False, kernel_size=0): super().__init__() self.embed_dim = args.decoder_embed_dim self.conv_dim = args.decoder_conv_dim if args.decoder_glu: self.linear1 = Linear(self.embed_dim, 2*self.conv_dim) self.act = nn.GLU() else: self.linear1 = Linear(self.embed_dim, self.conv_dim) self.act = None if args.decoder_conv_type == 'lightweight': self.conv = LightweightConv1dTBC(self.conv_dim, kernel_size, padding_l=kernel_size-1, weight_softmax=args.weight_softmax, num_heads=args.decoder_attention_heads, weight_dropout=args.weight_dropout) elif args.decoder_conv_type == 'dynamic': self.conv = DynamicConv1dTBC(self.conv_dim, kernel_size, padding_l=kernel_size-1, weight_softmax=args.weight_softmax, num_heads=args.decoder_attention_heads, weight_dropout=args.weight_dropout) else: raise NotImplementedError self.linear2 = Linear(self.conv_dim, self.embed_dim) self.dropout = args.dropout self.relu_dropout = args.relu_dropout self.input_dropout = args.input_dropout self.normalize_before = args.decoder_normalize_before self.conv_layer_norm = LayerNorm(self.embed_dim) if no_encoder_attn: self.encoder_attn = None self.encoder_attn_layer_norm = None else: self.encoder_attn = MultiheadAttention( self.embed_dim, args.decoder_attention_heads, dropout=args.attention_dropout, ) self.encoder_attn_layer_norm = LayerNorm(self.embed_dim) self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim) self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim) self.final_layer_norm = LayerNorm(self.embed_dim) self.need_attn = True def forward(self, x, encoder_out, encoder_padding_mask, incremental_state, prev_conv_state=None, prev_attn_state=None, conv_mask=None, conv_padding_mask=None): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor): binary ByteTensor of shape `(batch, src_len)` where padding elements are indicated by ``1``. Returns: encoded output of shape `(batch, src_len, embed_dim)` """ residual = x x = self.maybe_layer_norm(self.conv_layer_norm, x, before=True) if prev_conv_state is not None: if incremental_state is None: incremental_state = {} self.conv._set_input_buffer(incremental_state, prev_conv_state) x = F.dropout(x, p=self.input_dropout, training=self.training) x = self.linear1(x) if self.act is not None: x = self.act(x) x = self.conv(x, incremental_state=incremental_state) x = self.linear2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(self.conv_layer_norm, x, after=True) attn = None if self.encoder_attn is not None: residual = x x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True) if prev_attn_state is not None: if incremental_state is None: incremental_state = {} prev_key, prev_value = prev_attn_state saved_state = {"prev_key": prev_key, "prev_value": prev_value} self.encoder_attn._set_input_buffer(incremental_state, saved_state) x, attn = self.encoder_attn( query=x, key=encoder_out, value=encoder_out, key_padding_mask=encoder_padding_mask, incremental_state=incremental_state, static_kv=True, need_weights=(not self.training and self.need_attn), ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True) residual = x x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) x = F.relu(self.fc1(x)) x = F.dropout(x, p=self.relu_dropout, training=self.training) x = self.fc2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) return x, attn def maybe_layer_norm(self, layer_norm, x, before=False, after=False): assert before ^ after if after ^ self.normalize_before: return layer_norm(x) else: return x def make_generation_fast_(self, need_attn=False, **kwargs): self.need_attn = need_attn def extra_repr(self): return 'dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}'.format( self.dropout, self.relu_dropout, self.input_dropout, self.normalize_before) def Embedding(num_embeddings, embedding_dim, padding_idx): m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) nn.init.constant_(m.weight[padding_idx], 0) return m def LayerNorm(embedding_dim): m = nn.LayerNorm(embedding_dim) return m def Linear(in_features, out_features, bias=True): m = nn.Linear(in_features, out_features, bias) nn.init.xavier_uniform_(m.weight) if bias: nn.init.constant_(m.bias, 0.) return m def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad, learned=False): if learned: m = LearnedPositionalEmbedding(num_embeddings + padding_idx + 1, embedding_dim, padding_idx, left_pad) nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) nn.init.constant_(m.weight[padding_idx], 0) else: m = SinusoidalPositionalEmbedding(embedding_dim, padding_idx, left_pad, num_embeddings + padding_idx + 1) return m @register_model_architecture('lightconv_lm', 'lightconv_lm') def base_lm_architecture(args): args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512) args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 2048) args.decoder_layers = getattr(args, 'decoder_layers', 6) args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8) args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None) args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0) args.adaptive_softmax_factor = getattr(args, 'adaptive_softmax_factor', 4) args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False) args.character_embeddings = getattr(args, 'character_embeddings', False) args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim) args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim) # The model training is not stable without this args.decoder_normalize_before = True args.adaptive_input = getattr(args, 'adaptive_input', False) args.adaptive_input_factor = getattr(args, 'adaptive_input_factor', 4) args.adaptive_input_cutoff = getattr(args, 'adaptive_input_cutoff', None) args.tie_adaptive_weights = getattr(args, 'tie_adaptive_weights', False) args.tie_adaptive_proj = getattr(args, 'tie_adaptive_proj', False) args.decoder_kernel_size_list = getattr(args, 'decoder_kernel_size_list', [3, 7, 15, 31, 31, 31]) if len(args.decoder_kernel_size_list) == 1: args.decoder_kernel_size_list = args.decoder_kernel_size_list * args.decoder_layers @register_model_architecture('lightconv_lm', 'lightconv_lm_gbw') def lightconv_lm_gbw(args): args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512) args.dropout = getattr(args, 'dropout', 0.1) args.attention_dropout = getattr(args, 'attention_dropout', 0.1) args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096) args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16) base_lm_architecture(args) @register_model_architecture('lightconv', 'lightconv') def base_architecture(args): args.encoder_embed_path = getattr(args, 'encoder_embed_path', None) args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512) args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 2048) args.encoder_layers = getattr(args, 'encoder_layers', 7) args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 8) args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', False) args.encoder_learned_pos = getattr(args, 'encoder_learned_pos', False) args.decoder_embed_path = getattr(args, 'decoder_embed_path', None) args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', args.encoder_embed_dim) args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', args.encoder_ffn_embed_dim) args.decoder_layers = getattr(args, 'decoder_layers', 6) args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 8) args.decoder_normalize_before = getattr(args, 'decoder_normalize_before', False) args.decoder_learned_pos = getattr(args, 'decoder_learned_pos', False) args.attention_dropout = getattr(args, 'attention_dropout', 0.) args.relu_dropout = getattr(args, 'relu_dropout', 0.) args.dropout = getattr(args, 'dropout', 0.1) args.adaptive_softmax_cutoff = getattr(args, 'adaptive_softmax_cutoff', None) args.adaptive_softmax_dropout = getattr(args, 'adaptive_softmax_dropout', 0) args.share_decoder_input_output_embed = getattr(args, 'share_decoder_input_output_embed', False) args.share_all_embeddings = getattr(args, 'share_all_embeddings', False) args.no_token_positional_embeddings = getattr(args, 'no_token_positional_embeddings', False) args.decoder_output_dim = getattr(args, 'decoder_output_dim', args.decoder_embed_dim) args.decoder_input_dim = getattr(args, 'decoder_input_dim', args.decoder_embed_dim) args.encoder_conv_dim = getattr(args, 'encoder_conv_dim', args.encoder_embed_dim) args.decoder_conv_dim = getattr(args, 'decoder_conv_dim', args.decoder_embed_dim) args.encoder_kernel_size_list = getattr(args, 'encoder_kernel_size_list', [3, 7, 15, 31, 31, 31, 31]) args.decoder_kernel_size_list = getattr(args, 'decoder_kernel_size_list', [3, 7, 15, 31, 31, 31]) if len(args.encoder_kernel_size_list) == 1: args.encoder_kernel_size_list = args.encoder_kernel_size_list * args.encoder_layers if len(args.decoder_kernel_size_list) == 1: args.decoder_kernel_size_list = args.decoder_kernel_size_list * args.decoder_layers assert len(args.encoder_kernel_size_list) == args.encoder_layers, "encoder_kernel_size_list doesn't match encoder_layers" assert len(args.decoder_kernel_size_list) == args.decoder_layers, "decoder_kernel_size_list doesn't match decoder_layers" args.encoder_glu = getattr(args, 'encoder_glu', True) args.decoder_glu = getattr(args, 'decoder_glu', True) args.input_dropout = getattr(args, 'input_dropout', 0.1) args.weight_dropout = getattr(args, 'weight_dropout', args.attention_dropout) @register_model_architecture('lightconv', 'lightconv_iwslt_de_en') def lightconv_iwslt_de_en(args): args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 512) args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 1024) args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 4) args.encoder_layers = getattr(args, 'encoder_layers', 7) args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 512) args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 1024) args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 4) args.decoder_layers = getattr(args, 'decoder_layers', 6) args.attention_dropout = getattr(args, 'attention_dropout', 0.1) args.weight_dropout = getattr(args, 'weight_dropout', 0.1) args.encoder_glu = getattr(args, 'encoder_glu', False) args.decoder_glu = getattr(args, 'decoder_glu', False) args.input_dropout = getattr(args, 'input_dropout', 0.0) base_architecture(args) @register_model_architecture('lightconv', 'lightconv_wmt_en_de') def lightconv_wmt_en_de(args): base_architecture(args) @register_model_architecture('lightconv', 'lightconv_wmt_en_de_big') def lightconv_wmt_en_de_big(args): args.attention_dropout = getattr(args, 'attention_dropout', 0.1) args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1024) args.encoder_ffn_embed_dim = getattr(args, 'encoder_ffn_embed_dim', 4096) args.encoder_attention_heads = getattr(args, 'encoder_attention_heads', 16) args.encoder_normalize_before = getattr(args, 'encoder_normalize_before', False) args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 1024) args.decoder_ffn_embed_dim = getattr(args, 'decoder_ffn_embed_dim', 4096) args.decoder_attention_heads = getattr(args, 'decoder_attention_heads', 16) args.dropout = getattr(args, 'dropout', 0.3) base_architecture(args) @register_model_architecture('lightconv', 'lightconv_wmt_en_fr_big') def lightconv_wmt_en_fr_big(args): args.dropout = getattr(args, 'dropout', 0.1) lightconv_wmt_en_de_big(args) @register_model_architecture('lightconv', 'lightconv_wmt_zh_en_big') def lightconv_wmt_zh_en_big(args): args.dropout = getattr(args, 'dropout', 0.2) args.attention_dropout = getattr(args, 'attention_dropout', 0.2) args.weight_dropout = getattr(args, 'weight_dropout', 0.2) lightconv_wmt_en_de_big(args)