Python torch.nn.GLU Examples
The following are 30 code examples of torch.nn.GLU(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch.nn, or try the search function.
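Before diving into the project code, here is a minimal, self-contained sketch (my own illustration, not one of the 30 examples) of what nn.GLU computes: it splits its input in half along one dimension (the last one by default) and multiplies the first half by the sigmoid of the second half, so the gated dimension comes out half as wide.

import torch
import torch.nn as nn

glu = nn.GLU()                          # same as nn.GLU(dim=-1)
x = torch.randn(4, 10, 32)              # (batch, time, channels)
y = glu(x)
print(y.shape)                          # torch.Size([4, 10, 16]) -- channels halved

# the same computation written out explicitly
a, b = x.chunk(2, dim=-1)
assert torch.allclose(y, a * torch.sigmoid(b))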
Example #1
Source File: supervised_topic_model.py From causal-text-embeddings with MIT License | 7 votes |
def get_activation(self, act):
    if act == 'tanh':
        act = nn.Tanh()
    elif act == 'relu':
        act = nn.ReLU()
    elif act == 'softplus':
        act = nn.Softplus()
    elif act == 'rrelu':
        act = nn.RReLU()
    elif act == 'leakyrelu':
        act = nn.LeakyReLU()
    elif act == 'elu':
        act = nn.ELU()
    elif act == 'selu':
        act = nn.SELU()
    elif act == 'glu':
        act = nn.GLU()
    else:
        print('Defaulting to tanh activations...')
        act = nn.Tanh()
    return act
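One caveat worth keeping in mind with this helper (my observation, not part of the project code): unlike the elementwise activations above, nn.GLU halves the size of the dimension it gates, so any layer feeding a 'glu' activation must produce twice the desired width. A quick check:

import torch
import torch.nn as nn

x = torch.randn(8, 100)
print(nn.Tanh()(x).shape)   # torch.Size([8, 100]) -- shape preserved
print(nn.GLU()(x).shape)    # torch.Size([8, 50])  -- last dimension halved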
Example #2
Source File: etm.py From ETM with MIT License | 7 votes |
def get_activation(self, act):
    if act == 'tanh':
        act = nn.Tanh()
    elif act == 'relu':
        act = nn.ReLU()
    elif act == 'softplus':
        act = nn.Softplus()
    elif act == 'rrelu':
        act = nn.RReLU()
    elif act == 'leakyrelu':
        act = nn.LeakyReLU()
    elif act == 'elu':
        act = nn.ELU()
    elif act == 'selu':
        act = nn.SELU()
    elif act == 'glu':
        act = nn.GLU()
    else:
        print('Defaulting to tanh activations...')
        act = nn.Tanh()
    return act
Example #3
Source File: fconv_self_att.py From crosentgec with GNU General Public License v3.0 | 5 votes |
def add_args(parser):
    """Add model-specific arguments to the parser."""
    parser.add_argument('--dropout', default=0.1, type=float, metavar='D', help='dropout probability')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension')
    parser.add_argument('--encoder-layers', type=str, metavar='EXPR', help='encoder layers [(dim, kernel_size), ...]')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension')
    parser.add_argument('--decoder-layers', type=str, metavar='EXPR', help='decoder layers [(dim, kernel_size), ...]')
    parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N', help='decoder output embedding dimension')
    parser.add_argument('--decoder-attention', type=str, metavar='EXPR', help='decoder attention [True, ...]')
    parser.add_argument('--self-attention', default='False', type=str, metavar='EXPR', help='decoder self-attention layers, ex: [True] + [False]*5')
    parser.add_argument('--multihead-attention-nheads', default=1, type=int, help='Number of heads to use in attention')
    parser.add_argument('--multihead-self-attention-nheads', default=1, type=int, help='Number of heads to use in self-attention')
    parser.add_argument('--encoder-attention', type=str, metavar='EXPR', default='False', help='encoder attention [True, ...]')
    parser.add_argument('--encoder-attention-nheads', default=1, type=int, help='Number of heads to use in encoder attention')
    parser.add_argument('--project-input', type=str, metavar='EXPR', default='False', help='Use projections in self-attention [True, ...]')
    parser.add_argument('--gated-attention', type=str, metavar='EXPR', default='False', help='Use GLU layers in self-attention projections [True, ...]')
    parser.add_argument('--downsample', type=str, metavar='EXPR', default='False', help='Use downsampling in self-attention [True, ...]')
    parser.add_argument('--pretrained-checkpoint', metavar='DIR', default='', help='path to load checkpoint from pretrained model')
    parser.add_argument('--pretrained', type=str, metavar='EXPR', default='False', help='use pretrained model when training [True, ...]')
Example #4
Source File: lightconv.py From helo_word with Apache License 2.0 | 5 votes |
def __init__(self, args, kernel_size=0):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.conv_dim = args.encoder_conv_dim
    padding_l = kernel_size // 2 if kernel_size % 2 == 1 else ((kernel_size - 1) // 2, kernel_size // 2)

    if args.encoder_glu:
        self.linear1 = Linear(self.embed_dim, 2*self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.encoder_conv_type == 'lightweight':
        self.conv = LightweightConv1dTBC(self.conv_dim, kernel_size, padding_l=padding_l,
                                         weight_softmax=args.weight_softmax,
                                         num_heads=args.encoder_attention_heads,
                                         weight_dropout=args.weight_dropout)
    elif args.encoder_conv_type == 'dynamic':
        self.conv = DynamicConv1dTBC(self.conv_dim, kernel_size, padding_l=padding_l,
                                     weight_softmax=args.weight_softmax,
                                     num_heads=args.encoder_attention_heads,
                                     weight_dropout=args.weight_dropout)
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)

    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)])
Example #5
Source File: fconv_self_att.py From helo_word with Apache License 2.0 | 5 votes |
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension')
    parser.add_argument('--encoder-layers', type=str, metavar='EXPR', help='encoder layers [(dim, kernel_size), ...]')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension')
    parser.add_argument('--decoder-layers', type=str, metavar='EXPR', help='decoder layers [(dim, kernel_size), ...]')
    parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N', help='decoder output embedding dimension')
    parser.add_argument('--decoder-attention', type=str, metavar='EXPR', help='decoder attention [True, ...]')
    parser.add_argument('--self-attention', type=str, metavar='EXPR', help='decoder self-attention layers, ex: [True] + [False]*5')
    parser.add_argument('--multihead-attention-nheads', type=int, help='Number of heads to use in attention')
    parser.add_argument('--multihead-self-attention-nheads', type=int, help='Number of heads to use in self-attention')
    parser.add_argument('--encoder-attention', type=str, metavar='EXPR', help='encoder attention [True, ...]')
    parser.add_argument('--encoder-attention-nheads', type=int, help='Number of heads to use in encoder attention')
    parser.add_argument('--project-input', type=str, metavar='EXPR', help='Use projections in self-attention [True, ...]')
    parser.add_argument('--gated-attention', type=str, metavar='EXPR', help='Use GLU layers in self-attention projections [True, ...]')
    parser.add_argument('--downsample', type=str, metavar='EXPR', help='Use downsampling in self-attention [True, ...]')
    parser.add_argument('--pretrained-checkpoint', metavar='DIR', help='path to load checkpoint from pretrained model')
    parser.add_argument('--pretrained', type=str, metavar='EXPR', help='use pretrained model when training [True, ...]')
    # fmt: on
Example #6
Source File: downsampled_multihead_attention.py From helo_word with Apache License 2.0 | 5 votes |
def GatedLinear(in_features, out_features, dropout=0., bias=True):
    """Weight-normalized Linear layer (input: B x T x C) with interspersed GLU units"""
    return nn.Sequential(
        Linear(in_features, out_features*4, dropout, bias),
        nn.GLU(),
        Linear(out_features*2, out_features*2, dropout, bias),
        nn.GLU(),
        Linear(out_features, out_features, dropout, bias)
    )
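As a sanity check of the dimension flow in GatedLinear (a sketch only; it substitutes plain nn.Linear for the project's weight-normalized Linear helper, which also takes a dropout argument): each nn.GLU halves the feature dimension, which is why the first projection goes up to out_features*4 and the middle one operates on out_features*2.

import torch
import torch.nn as nn

in_features, out_features = 16, 8
gated = nn.Sequential(
    nn.Linear(in_features, out_features * 4),       # stand-in for the weight-normalized Linear
    nn.GLU(),                                       # 4*out -> 2*out
    nn.Linear(out_features * 2, out_features * 2),
    nn.GLU(),                                       # 2*out -> out
    nn.Linear(out_features, out_features),
)
print(gated(torch.randn(2, 5, in_features)).shape)  # torch.Size([2, 5, 8])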
Example #7
Source File: lightconv.py From attn2d with MIT License | 5 votes |
def __init__(self, args, kernel_size=0):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.conv_dim = args.encoder_conv_dim
    padding_l = kernel_size // 2 if kernel_size % 2 == 1 else ((kernel_size - 1) // 2, kernel_size // 2)

    if args.encoder_glu:
        self.linear1 = Linear(self.embed_dim, 2*self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.encoder_conv_type == 'lightweight':
        self.conv = LightweightConv(self.conv_dim, kernel_size, padding_l=padding_l,
                                    weight_softmax=args.weight_softmax,
                                    num_heads=args.encoder_attention_heads,
                                    weight_dropout=args.weight_dropout)
    elif args.encoder_conv_type == 'dynamic':
        self.conv = DynamicConv(self.conv_dim, kernel_size, padding_l=padding_l,
                                weight_softmax=args.weight_softmax,
                                num_heads=args.encoder_attention_heads,
                                weight_dropout=args.weight_dropout)
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)

    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)])
Example #8
Source File: fconv_self_att.py From attn2d with MIT License | 5 votes |
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension')
    parser.add_argument('--encoder-layers', type=str, metavar='EXPR', help='encoder layers [(dim, kernel_size), ...]')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension')
    parser.add_argument('--decoder-layers', type=str, metavar='EXPR', help='decoder layers [(dim, kernel_size), ...]')
    parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N', help='decoder output embedding dimension')
    parser.add_argument('--decoder-attention', type=str, metavar='EXPR', help='decoder attention [True, ...]')
    parser.add_argument('--self-attention', type=str, metavar='EXPR', help='decoder self-attention layers, ex: [True] + [False]*5')
    parser.add_argument('--multihead-attention-nheads', type=int, help='Number of heads to use in attention')
    parser.add_argument('--multihead-self-attention-nheads', type=int, help='Number of heads to use in self-attention')
    parser.add_argument('--encoder-attention', type=str, metavar='EXPR', help='encoder attention [True, ...]')
    parser.add_argument('--encoder-attention-nheads', type=int, help='Number of heads to use in encoder attention')
    parser.add_argument('--project-input', type=str, metavar='EXPR', help='Use projections in self-attention [True, ...]')
    parser.add_argument('--gated-attention', type=str, metavar='EXPR', help='Use GLU layers in self-attention projections [True, ...]')
    parser.add_argument('--downsample', type=str, metavar='EXPR', help='Use downsampling in self-attention [True, ...]')
    parser.add_argument('--pretrained-checkpoint', metavar='DIR', help='path to load checkpoint from pretrained model')
    parser.add_argument('--pretrained', type=str, metavar='EXPR', help='use pretrained model when training [True, ...]')
    # fmt: on
Example #9
Source File: downsampled_multihead_attention.py From attn2d with MIT License | 5 votes |
def GatedLinear(in_features, out_features, dropout=0., bias=True):
    """Weight-normalized Linear layer (input: B x T x C) with interspersed GLU units"""
    return nn.Sequential(
        Linear(in_features, out_features*4, dropout, bias),
        nn.GLU(),
        Linear(out_features*2, out_features*2, dropout, bias),
        nn.GLU(),
        Linear(out_features, out_features, dropout, bias)
    )
Example #10
Source File: AoAModel.py From ImageCaptioning.pytorch with MIT License | 5 votes |
def __init__(self, opt):
    super(AoA_Decoder_Core, self).__init__()
    self.drop_prob_lm = opt.drop_prob_lm
    self.d_model = opt.rnn_size
    self.use_multi_head = opt.use_multi_head
    self.multi_head_scale = opt.multi_head_scale
    self.use_ctx_drop = getattr(opt, 'ctx_drop', 0)
    self.out_res = getattr(opt, 'out_res', 0)
    self.decoder_type = getattr(opt, 'decoder_type', 'AoA')
    self.att_lstm = nn.LSTMCell(opt.input_encoding_size + opt.rnn_size, opt.rnn_size)  # we, fc, h^2_t-1
    self.out_drop = nn.Dropout(self.drop_prob_lm)

    if self.decoder_type == 'AoA':
        # AoA layer
        self.att2ctx = nn.Sequential(nn.Linear(self.d_model * opt.multi_head_scale + opt.rnn_size, 2 * opt.rnn_size), nn.GLU())
    elif self.decoder_type == 'LSTM':
        # LSTM layer
        self.att2ctx = nn.LSTMCell(self.d_model * opt.multi_head_scale + opt.rnn_size, opt.rnn_size)
    else:
        # Base linear layer
        self.att2ctx = nn.Sequential(nn.Linear(self.d_model * opt.multi_head_scale + opt.rnn_size, opt.rnn_size), nn.ReLU())

    # if opt.use_multi_head == 1: # TODO, not implemented for now
    #     self.attention = MultiHeadedAddAttention(opt.num_heads, opt.d_model, scale=opt.multi_head_scale)
    if opt.use_multi_head == 2:
        self.attention = MultiHeadedDotAttention(opt.num_heads, opt.rnn_size, project_k_v=0, scale=opt.multi_head_scale, use_output_layer=0, do_aoa=0, norm_q=1)
    else:
        self.attention = Attention(opt)

    if self.use_ctx_drop:
        self.ctx_drop = nn.Dropout(self.drop_prob_lm)
    else:
        self.ctx_drop = lambda x: x
Example #11
Source File: AoAModel.py From ImageCaptioning.pytorch with MIT License | 5 votes |
def __init__(self, h, d_model, dropout=0.1, scale=1, project_k_v=1,
             use_output_layer=1, do_aoa=0, norm_q=0, dropout_aoa=0.3):
    super(MultiHeadedDotAttention, self).__init__()
    assert d_model * scale % h == 0
    # We assume d_v always equals d_k
    self.d_k = d_model * scale // h
    self.h = h

    # Do we need to do linear projections on K and V?
    self.project_k_v = project_k_v

    # normalize the query?
    if norm_q:
        self.norm = LayerNorm(d_model)
    else:
        self.norm = lambda x: x
    self.linears = clones(nn.Linear(d_model, d_model * scale), 1 + 2 * project_k_v)

    # output linear layer after the multi-head attention?
    self.output_layer = nn.Linear(d_model * scale, d_model)

    # apply aoa after attention?
    self.use_aoa = do_aoa
    if self.use_aoa:
        self.aoa_layer = nn.Sequential(nn.Linear((1 + scale) * d_model, 2 * d_model), nn.GLU())
        # dropout to the input of AoA layer
        if dropout_aoa > 0:
            self.dropout_aoa = nn.Dropout(p=dropout_aoa)
        else:
            self.dropout_aoa = lambda x: x

    if self.use_aoa or not use_output_layer:
        # AoA doesn't need the output linear layer
        del self.output_layer
        self.output_layer = lambda x: x

    self.attn = None
    self.dropout = nn.Dropout(p=dropout)
Example #12
Source File: AoAModel.py From self-critical.pytorch with MIT License | 5 votes |
def __init__(self, opt):
    super(AoA_Decoder_Core, self).__init__()
    self.drop_prob_lm = opt.drop_prob_lm
    self.d_model = opt.rnn_size
    self.use_multi_head = opt.use_multi_head
    self.multi_head_scale = opt.multi_head_scale
    self.use_ctx_drop = getattr(opt, 'ctx_drop', 0)
    self.out_res = getattr(opt, 'out_res', 0)
    self.decoder_type = getattr(opt, 'decoder_type', 'AoA')
    self.att_lstm = nn.LSTMCell(opt.input_encoding_size + opt.rnn_size, opt.rnn_size)  # we, fc, h^2_t-1
    self.out_drop = nn.Dropout(self.drop_prob_lm)

    if self.decoder_type == 'AoA':
        # AoA layer
        self.att2ctx = nn.Sequential(nn.Linear(self.d_model * opt.multi_head_scale + opt.rnn_size, 2 * opt.rnn_size), nn.GLU())
    elif self.decoder_type == 'LSTM':
        # LSTM layer
        self.att2ctx = nn.LSTMCell(self.d_model * opt.multi_head_scale + opt.rnn_size, opt.rnn_size)
    else:
        # Base linear layer
        self.att2ctx = nn.Sequential(nn.Linear(self.d_model * opt.multi_head_scale + opt.rnn_size, opt.rnn_size), nn.ReLU())

    # if opt.use_multi_head == 1: # TODO, not implemented for now
    #     self.attention = MultiHeadedAddAttention(opt.num_heads, opt.d_model, scale=opt.multi_head_scale)
    if opt.use_multi_head == 2:
        self.attention = MultiHeadedDotAttention(opt.num_heads, opt.rnn_size, project_k_v=0, scale=opt.multi_head_scale, use_output_layer=0, do_aoa=0, norm_q=1)
    else:
        self.attention = Attention(opt)

    if self.use_ctx_drop:
        self.ctx_drop = nn.Dropout(self.drop_prob_lm)
    else:
        self.ctx_drop = lambda x: x
Example #13
Source File: AoAModel.py From self-critical.pytorch with MIT License | 5 votes |
def __init__(self, h, d_model, dropout=0.1, scale=1, project_k_v=1,
             use_output_layer=1, do_aoa=0, norm_q=0, dropout_aoa=0.3):
    super(MultiHeadedDotAttention, self).__init__()
    assert d_model * scale % h == 0
    # We assume d_v always equals d_k
    self.d_k = d_model * scale // h
    self.h = h

    # Do we need to do linear projections on K and V?
    self.project_k_v = project_k_v

    # normalize the query?
    if norm_q:
        self.norm = LayerNorm(d_model)
    else:
        self.norm = lambda x: x
    self.linears = clones(nn.Linear(d_model, d_model * scale), 1 + 2 * project_k_v)

    # output linear layer after the multi-head attention?
    self.output_layer = nn.Linear(d_model * scale, d_model)

    # apply aoa after attention?
    self.use_aoa = do_aoa
    if self.use_aoa:
        self.aoa_layer = nn.Sequential(nn.Linear((1 + scale) * d_model, 2 * d_model), nn.GLU())
        # dropout to the input of AoA layer
        if dropout_aoa > 0:
            self.dropout_aoa = nn.Dropout(p=dropout_aoa)
        else:
            self.dropout_aoa = lambda x: x

    if self.use_aoa or not use_output_layer:
        # AoA doesn't need the output linear layer
        del self.output_layer
        self.output_layer = lambda x: x

    self.attn = None
    self.dropout = nn.Dropout(p=dropout)
Example #14
Source File: rnn.py From Global-Encoding with MIT License | 5 votes |
def __init__(self, config, embedding=None):
    super(rnn_encoder, self).__init__()
    self.embedding = embedding if embedding is not None else nn.Embedding(config.src_vocab_size, config.emb_size)
    self.hidden_size = config.hidden_size
    self.config = config

    if config.swish:
        self.sw1 = nn.Sequential(nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=1, padding=0),
                                 nn.BatchNorm1d(config.hidden_size), nn.ReLU())
        self.sw3 = nn.Sequential(nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=1, padding=0),
                                 nn.ReLU(), nn.BatchNorm1d(config.hidden_size),
                                 nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=3, padding=1),
                                 nn.ReLU(), nn.BatchNorm1d(config.hidden_size))
        self.sw33 = nn.Sequential(nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=1, padding=0),
                                  nn.ReLU(), nn.BatchNorm1d(config.hidden_size),
                                  nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=3, padding=1),
                                  nn.ReLU(), nn.BatchNorm1d(config.hidden_size),
                                  nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=3, padding=1),
                                  nn.ReLU(), nn.BatchNorm1d(config.hidden_size))
        self.linear = nn.Sequential(nn.Linear(2*config.hidden_size, 2*config.hidden_size), nn.GLU(), nn.Dropout(config.dropout))
        self.filter_linear = nn.Linear(3*config.hidden_size, config.hidden_size)
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()

    if config.selfatt:
        if config.attention == 'None':
            self.attention = None
        elif config.attention == 'bahdanau':
            self.attention = models.bahdanau_attention(config.hidden_size, config.emb_size, config.pool_size)
        elif config.attention == 'luong':
            self.attention = models.luong_attention(config.hidden_size, config.emb_size, config.pool_size)
        elif config.attention == 'luong_gate':
            self.attention = models.luong_gate_attention(config.hidden_size, config.emb_size)

    if config.cell == 'gru':
        self.rnn = nn.GRU(input_size=config.emb_size, hidden_size=config.hidden_size,
                          num_layers=config.enc_num_layers, dropout=config.dropout,
                          bidirectional=config.bidirectional)
    else:
        self.rnn = nn.LSTM(input_size=config.emb_size, hidden_size=config.hidden_size,
                           num_layers=config.enc_num_layers, dropout=config.dropout,
                           bidirectional=config.bidirectional)
Example #15
Source File: dynamic_conv2d.py From espnet with Apache License 2.0 | 5 votes |
def __init__(
    self,
    wshare,
    n_feat,
    dropout_rate,
    kernel_size_str,
    lnum,
    use_kernel_mask=False,
    use_bias=False,
):
    """Construct Dynamic 2-Dimensional Convolution layer."""
    super(DynamicConvolution2D, self).__init__()

    assert n_feat % wshare == 0
    self.wshare = wshare
    self.use_kernel_mask = use_kernel_mask
    self.dropout_rate = dropout_rate
    self.kernel_size = int(kernel_size_str.split("_")[lnum])
    self.padding_size = int(self.kernel_size / 2)
    self.attn_t = None
    self.attn_f = None

    # linear -> GLU -- -> lightconv -> linear
    #               \        /
    #                 Linear
    self.linear1 = nn.Linear(n_feat, n_feat * 2)
    self.linear2 = nn.Linear(n_feat * 2, n_feat)
    self.linear_weight = nn.Linear(n_feat, self.wshare * 1 * self.kernel_size)
    nn.init.xavier_uniform(self.linear_weight.weight)
    self.linear_weight_f = nn.Linear(n_feat, self.kernel_size)
    nn.init.xavier_uniform(self.linear_weight_f.weight)
    self.act = nn.GLU()

    # dynamic conv related
    self.use_bias = use_bias
    if self.use_bias:
        self.bias = nn.Parameter(torch.Tensor(n_feat))
Example #16
Source File: lightconv.py From espnet with Apache License 2.0 | 5 votes |
def __init__(
    self,
    wshare,
    n_feat,
    dropout_rate,
    kernel_size_str,
    lnum,
    use_kernel_mask=False,
    use_bias=False,
):
    """Construct Lightweight Convolution layer."""
    super(LightweightConvolution, self).__init__()

    assert n_feat % wshare == 0
    self.wshare = wshare
    self.use_kernel_mask = use_kernel_mask
    self.dropout_rate = dropout_rate
    self.kernel_size = int(kernel_size_str.split("_")[lnum])
    self.padding_size = int(self.kernel_size / 2)

    # linear -> GLU -> lightconv -> linear
    self.linear1 = nn.Linear(n_feat, n_feat * 2)
    self.linear2 = nn.Linear(n_feat, n_feat)
    self.act = nn.GLU()

    # lightconv related
    self.weight = nn.Parameter(
        torch.Tensor(self.wshare, 1, self.kernel_size).uniform_(0, 1)
    )
    self.use_bias = use_bias
    if self.use_bias:
        self.bias = nn.Parameter(torch.Tensor(n_feat))

    # mask of kernel
    kernel_mask0 = torch.zeros(self.wshare, int(self.kernel_size / 2))
    kernel_mask1 = torch.ones(self.wshare, int(self.kernel_size / 2 + 1))
    self.kernel_mask = torch.cat((kernel_mask1, kernel_mask0), dim=-1).unsqueeze(1)
Example #17
Source File: lightconv2d.py From espnet with Apache License 2.0 | 5 votes |
def __init__(
    self,
    wshare,
    n_feat,
    dropout_rate,
    kernel_size_str,
    lnum,
    use_kernel_mask=False,
    use_bias=False,
):
    """Construct Lightweight 2-Dimensional Convolution layer."""
    super(LightweightConvolution2D, self).__init__()

    assert n_feat % wshare == 0
    self.wshare = wshare
    self.use_kernel_mask = use_kernel_mask
    self.dropout_rate = dropout_rate
    self.kernel_size = int(kernel_size_str.split("_")[lnum])
    self.padding_size = int(self.kernel_size / 2)

    # linear -> GLU -> lightconv -> linear
    self.linear1 = nn.Linear(n_feat, n_feat * 2)
    self.linear2 = nn.Linear(n_feat * 2, n_feat)
    self.act = nn.GLU()

    # lightconv related
    self.weight = nn.Parameter(
        torch.Tensor(self.wshare, 1, self.kernel_size).uniform_(0, 1)
    )
    self.weight_f = nn.Parameter(
        torch.Tensor(1, 1, self.kernel_size).uniform_(0, 1)
    )
    self.use_bias = use_bias
    if self.use_bias:
        self.bias = nn.Parameter(torch.Tensor(n_feat))

    # mask of kernel
    kernel_mask0 = torch.zeros(self.wshare, int(self.kernel_size / 2))
    kernel_mask1 = torch.ones(self.wshare, int(self.kernel_size / 2 + 1))
    self.kernel_mask = torch.cat((kernel_mask1, kernel_mask0), dim=-1).unsqueeze(1)
Example #18
Source File: downsampled_multihead_attention.py From crosentgec with GNU General Public License v3.0 | 5 votes |
def GatedLinear(in_features, out_features, dropout=0., bias=True):
    """Weight-normalized Linear layer (input: B x T x C) with interspersed GLU units"""
    return nn.Sequential(
        Linear(in_features, out_features*4, dropout, bias),
        nn.GLU(),
        Linear(out_features*2, out_features*2, dropout, bias),
        nn.GLU(),
        Linear(out_features, out_features, dropout, bias)
    )
Example #19
Source File: downsampled_multihead_attention.py From fairseq with MIT License | 5 votes |
def GatedLinear(in_features, out_features, dropout=0., bias=True):
    """Weight-normalized Linear layer (input: B x T x C) with interspersed GLU units"""
    return nn.Sequential(
        Linear(in_features, out_features*4, dropout, bias),
        nn.GLU(),
        Linear(out_features*2, out_features*2, dropout, bias),
        nn.GLU(),
        Linear(out_features, out_features, dropout, bias)
    )
Example #20
Source File: fconv_self_att.py From fairseq with MIT License | 5 votes |
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension')
    parser.add_argument('--encoder-layers', type=str, metavar='EXPR', help='encoder layers [(dim, kernel_size), ...]')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension')
    parser.add_argument('--decoder-layers', type=str, metavar='EXPR', help='decoder layers [(dim, kernel_size), ...]')
    parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N', help='decoder output embedding dimension')
    parser.add_argument('--decoder-attention', type=str, metavar='EXPR', help='decoder attention [True, ...]')
    parser.add_argument('--self-attention', type=str, metavar='EXPR', help='decoder self-attention layers, ex: [True] + [False]*5')
    parser.add_argument('--multihead-attention-nheads', type=int, help='Number of heads to use in attention')
    parser.add_argument('--multihead-self-attention-nheads', type=int, help='Number of heads to use in self-attention')
    parser.add_argument('--encoder-attention', type=str, metavar='EXPR', help='encoder attention [True, ...]')
    parser.add_argument('--encoder-attention-nheads', type=int, help='Number of heads to use in encoder attention')
    parser.add_argument('--project-input', type=str, metavar='EXPR', help='Use projections in self-attention [True, ...]')
    parser.add_argument('--gated-attention', type=str, metavar='EXPR', help='Use GLU layers in self-attention projections [True, ...]')
    parser.add_argument('--downsample', type=str, metavar='EXPR', help='Use downsampling in self-attention [True, ...]')
    parser.add_argument('--pretrained-checkpoint', metavar='DIR', help='path to load checkpoint from pretrained model')
    parser.add_argument('--pretrained', type=str, metavar='EXPR', help='use pretrained model when training [True, ...]')
    # fmt: on
Example #21
Source File: lightconv.py From fairseq with MIT License | 5 votes |
def __init__(self, args, kernel_size=0):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.conv_dim = args.encoder_conv_dim
    padding_l = kernel_size // 2 if kernel_size % 2 == 1 else ((kernel_size - 1) // 2, kernel_size // 2)

    if args.encoder_glu:
        self.linear1 = Linear(self.embed_dim, 2*self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.encoder_conv_type == 'lightweight':
        self.conv = LightweightConv(self.conv_dim, kernel_size, padding_l=padding_l,
                                    weight_softmax=args.weight_softmax,
                                    num_heads=args.encoder_attention_heads,
                                    weight_dropout=args.weight_dropout)
    elif args.encoder_conv_type == 'dynamic':
        self.conv = DynamicConv(self.conv_dim, kernel_size, padding_l=padding_l,
                                weight_softmax=args.weight_softmax,
                                num_heads=args.encoder_attention_heads,
                                weight_dropout=args.weight_dropout)
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)

    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)])
Example #22
Source File: AoAModel.py From AAT with MIT License | 5 votes |
def __init__(self, h, d_model, dropout=0.1, scale=1, project_k_v=1,
             use_output_layer=1, do_aoa=0, norm_q=0, dropout_aoa=0.3):
    super(MultiHeadedDotAttention, self).__init__()
    assert d_model * scale % h == 0
    # We assume d_v always equals d_k
    self.d_k = d_model * scale // h
    self.h = h

    # Do we need to do linear projections on K and V?
    self.project_k_v = project_k_v

    # normalize the query?
    if norm_q:
        self.norm = LayerNorm(d_model)
    else:
        self.norm = lambda x: x
    self.linears = clones(nn.Linear(d_model, d_model * scale), 1 + 2 * project_k_v)

    # output linear layer after the multi-head attention?
    self.output_layer = nn.Linear(d_model * scale, d_model)

    # apply aoa after attention?
    self.use_aoa = do_aoa
    if self.use_aoa:
        self.aoa_layer = nn.Sequential(nn.Linear((1 + scale) * d_model, 2 * d_model), nn.GLU())
        # dropout to the input of AoA layer
        if dropout_aoa > 0:
            self.dropout_aoa = nn.Dropout(p=dropout_aoa)
        else:
            self.dropout_aoa = lambda x: x

    if self.use_aoa or not use_output_layer:
        # AoA doesn't need the output linear layer
        del self.output_layer
        self.output_layer = lambda x: x

    self.attn = None
    self.dropout = nn.Dropout(p=dropout)
Example #23
Source File: AoAModel.py From AAT with MIT License | 5 votes |
def __init__(self, opt):
    super(AoA_Decoder_Core, self).__init__()
    self.drop_prob_lm = opt.drop_prob_lm
    self.d_model = opt.rnn_size
    self.use_multi_head = opt.use_multi_head
    self.multi_head_scale = opt.multi_head_scale
    self.use_ctx_drop = getattr(opt, 'ctx_drop', 0)
    self.out_res = getattr(opt, 'out_res', 0)
    self.decoder_type = getattr(opt, 'decoder_type', 'AoA')
    self.att_lstm = nn.LSTMCell(opt.input_encoding_size + opt.rnn_size, opt.rnn_size)  # we, fc, h^2_t-1
    self.out_drop = nn.Dropout(self.drop_prob_lm)

    if self.decoder_type == 'AoA':
        # AoA layer
        self.att2ctx = nn.Sequential(nn.Linear(self.d_model * opt.multi_head_scale + opt.rnn_size, 2 * opt.rnn_size), nn.GLU())
    elif self.decoder_type == 'LSTM':
        # LSTM layer
        self.att2ctx = nn.LSTMCell(self.d_model * opt.multi_head_scale + opt.rnn_size, opt.rnn_size)
    else:
        # Base linear layer
        self.att2ctx = nn.Sequential(nn.Linear(self.d_model * opt.multi_head_scale + opt.rnn_size, opt.rnn_size), nn.ReLU())

    # if opt.use_multi_head == 1: # TODO, not implemented for now
    #     self.attention = MultiHeadedAddAttention(opt.num_heads, opt.d_model, scale=opt.multi_head_scale)
    if opt.use_multi_head == 2:
        self.attention = MultiHeadedDotAttention(opt.num_heads, opt.rnn_size, project_k_v=0, scale=opt.multi_head_scale, use_output_layer=0, do_aoa=0, norm_q=1)
    else:
        self.attention = Attention(opt)

    if self.use_ctx_drop:
        self.ctx_drop = nn.Dropout(self.drop_prob_lm)
    else:
        self.ctx_drop = lambda x: x
Example #24
Source File: downsampled_multihead_attention.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def GatedLinear(in_features, out_features, dropout=0., bias=True):
    """Weight-normalized Linear layer (input: B x T x C) with interspersed GLU units"""
    return nn.Sequential(
        Linear(in_features, out_features*4, dropout, bias),
        nn.GLU(),
        Linear(out_features*2, out_features*2, dropout, bias),
        nn.GLU(),
        Linear(out_features, out_features, dropout, bias)
    )
Example #25
Source File: fconv_self_att.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def add_args(parser):
    """Add model-specific arguments to the parser."""
    parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension')
    parser.add_argument('--encoder-layers', type=str, metavar='EXPR', help='encoder layers [(dim, kernel_size), ...]')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension')
    parser.add_argument('--decoder-layers', type=str, metavar='EXPR', help='decoder layers [(dim, kernel_size), ...]')
    parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N', help='decoder output embedding dimension')
    parser.add_argument('--decoder-attention', type=str, metavar='EXPR', help='decoder attention [True, ...]')
    parser.add_argument('--self-attention', type=str, metavar='EXPR', help='decoder self-attention layers, ex: [True] + [False]*5')
    parser.add_argument('--multihead-attention-nheads', type=int, help='Number of heads to use in attention')
    parser.add_argument('--multihead-self-attention-nheads', type=int, help='Number of heads to use in self-attention')
    parser.add_argument('--encoder-attention', type=str, metavar='EXPR', help='encoder attention [True, ...]')
    parser.add_argument('--encoder-attention-nheads', type=int, help='Number of heads to use in encoder attention')
    parser.add_argument('--project-input', type=str, metavar='EXPR', help='Use projections in self-attention [True, ...]')
    parser.add_argument('--gated-attention', type=str, metavar='EXPR', help='Use GLU layers in self-attention projections [True, ...]')
    parser.add_argument('--downsample', type=str, metavar='EXPR', help='Use downsampling in self-attention [True, ...]')
    parser.add_argument('--pretrained-checkpoint', metavar='DIR', help='path to load checkpoint from pretrained model')
    parser.add_argument('--pretrained', type=str, metavar='EXPR', help='use pretrained model when training [True, ...]')
Example #26
Source File: AoAModel.py From AoANet with MIT License | 5 votes |
def __init__(self, h, d_model, dropout=0.1, scale=1, project_k_v=1,
             use_output_layer=1, do_aoa=0, norm_q=0, dropout_aoa=0.3):
    super(MultiHeadedDotAttention, self).__init__()
    assert d_model * scale % h == 0
    # We assume d_v always equals d_k
    self.d_k = d_model * scale // h
    self.h = h

    # Do we need to do linear projections on K and V?
    self.project_k_v = project_k_v

    # normalize the query?
    if norm_q:
        self.norm = LayerNorm(d_model)
    else:
        self.norm = lambda x: x
    self.linears = clones(nn.Linear(d_model, d_model * scale), 1 + 2 * project_k_v)

    # output linear layer after the multi-head attention?
    self.output_layer = nn.Linear(d_model * scale, d_model)

    # apply aoa after attention?
    self.use_aoa = do_aoa
    if self.use_aoa:
        self.aoa_layer = nn.Sequential(nn.Linear((1 + scale) * d_model, 2 * d_model), nn.GLU())
        # dropout to the input of AoA layer
        if dropout_aoa > 0:
            self.dropout_aoa = nn.Dropout(p=dropout_aoa)
        else:
            self.dropout_aoa = lambda x: x

    if self.use_aoa or not use_output_layer:
        # AoA doesn't need the output linear layer
        del self.output_layer
        self.output_layer = lambda x: x

    self.attn = None
    self.dropout = nn.Dropout(p=dropout)
Example #27
Source File: AoAModel.py From AoANet with MIT License | 5 votes |
def __init__(self, opt):
    super(AoA_Decoder_Core, self).__init__()
    self.drop_prob_lm = opt.drop_prob_lm
    self.d_model = opt.rnn_size
    self.use_multi_head = opt.use_multi_head
    self.multi_head_scale = opt.multi_head_scale
    self.use_ctx_drop = getattr(opt, 'ctx_drop', 0)
    self.out_res = getattr(opt, 'out_res', 0)
    self.decoder_type = getattr(opt, 'decoder_type', 'AoA')
    self.att_lstm = nn.LSTMCell(opt.input_encoding_size + opt.rnn_size, opt.rnn_size)  # we, fc, h^2_t-1
    self.out_drop = nn.Dropout(self.drop_prob_lm)

    if self.decoder_type == 'AoA':
        # AoA layer
        self.att2ctx = nn.Sequential(nn.Linear(self.d_model * opt.multi_head_scale + opt.rnn_size, 2 * opt.rnn_size), nn.GLU())
    elif self.decoder_type == 'LSTM':
        # LSTM layer
        self.att2ctx = nn.LSTMCell(self.d_model * opt.multi_head_scale + opt.rnn_size, opt.rnn_size)
    else:
        # Base linear layer
        self.att2ctx = nn.Sequential(nn.Linear(self.d_model * opt.multi_head_scale + opt.rnn_size, opt.rnn_size), nn.ReLU())

    # if opt.use_multi_head == 1: # TODO, not implemented for now
    #     self.attention = MultiHeadedAddAttention(opt.num_heads, opt.d_model, scale=opt.multi_head_scale)
    if opt.use_multi_head == 2:
        self.attention = MultiHeadedDotAttention(opt.num_heads, opt.rnn_size, project_k_v=0, scale=opt.multi_head_scale, use_output_layer=0, do_aoa=0, norm_q=1)
    else:
        self.attention = Attention(opt)

    if self.use_ctx_drop:
        self.ctx_drop = nn.Dropout(self.drop_prob_lm)
    else:
        self.ctx_drop = lambda x: x
Example #28
Source File: GCN.py From graph-2-text with MIT License | 4 votes |
def __init__(self, num_inputs, num_units, num_labels, in_arcs=True, out_arcs=True,
             batch_first=False, use_gates=True, use_glus=False):
    super(GCNLayer, self).__init__()

    self.in_arcs = in_arcs
    self.out_arcs = out_arcs
    self.num_inputs = num_inputs
    self.num_units = num_units
    self.num_labels = num_labels
    self.batch_first = batch_first

    self.glu = nn.GLU(3)
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    self.use_gates = use_gates
    self.use_glus = use_glus

    # https://www.cs.toronto.edu/~yujiali/files/talks/iclr16_ggnn_talk.pdf
    # https://arxiv.org/pdf/1612.08083.pdf

    if in_arcs:
        self.V_in = Parameter(torch.Tensor(self.num_inputs, self.num_units))
        nn.init.xavier_normal(self.V_in)

        self.b_in = Parameter(torch.Tensor(num_labels, self.num_units))
        nn.init.constant(self.b_in, 0)

        if self.use_gates:
            self.V_in_gate = Parameter(torch.Tensor(self.num_inputs, 1))
            nn.init.xavier_normal(self.V_in_gate)
            self.b_in_gate = Parameter(torch.Tensor(num_labels, 1))
            nn.init.constant(self.b_in_gate, 1)

    if out_arcs:
        self.V_out = Parameter(torch.Tensor(self.num_inputs, self.num_units))
        nn.init.xavier_normal(self.V_out)

        self.b_out = Parameter(torch.Tensor(num_labels, self.num_units))
        nn.init.constant(self.b_out, 0)

        if self.use_gates:
            self.V_out_gate = Parameter(torch.Tensor(self.num_inputs, 1))
            nn.init.xavier_normal(self.V_out_gate)
            self.b_out_gate = Parameter(torch.Tensor(num_labels, 1))
            nn.init.constant(self.b_out_gate, 1)

    self.W_self_loop = Parameter(torch.Tensor(self.num_inputs, self.num_units))
    nn.init.xavier_normal(self.W_self_loop)

    if self.use_gates:
        self.W_self_loop_gate = Parameter(torch.Tensor(self.num_inputs, 1))
        nn.init.xavier_normal(self.W_self_loop_gate)
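A short aside on the nn.GLU(3) above (my illustration; the actual tensor layout inside GCNLayer is defined by the project): the constructor argument is the dimension to split and gate, so on a 4-D tensor it halves dimension 3 rather than the last-by-default behavior shown earlier.

import torch
import torch.nn as nn

glu = nn.GLU(3)                 # gate along dimension 3
x = torch.randn(2, 7, 5, 64)    # hypothetical 4-D input
print(glu(x).shape)             # torch.Size([2, 7, 5, 32]) -- dim 3 halved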
Example #29
Source File: lightconv.py From fairseq with MIT License | 4 votes |
def __init__(self, args, no_encoder_attn=False, kernel_size=0):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.conv_dim = args.decoder_conv_dim
    if args.decoder_glu:
        self.linear1 = Linear(self.embed_dim, 2*self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.decoder_conv_type == 'lightweight':
        self.conv = LightweightConv(self.conv_dim, kernel_size, padding_l=kernel_size-1,
                                    weight_softmax=args.weight_softmax,
                                    num_heads=args.decoder_attention_heads,
                                    weight_dropout=args.weight_dropout)
    elif args.decoder_conv_type == 'dynamic':
        self.conv = DynamicConv(self.conv_dim, kernel_size, padding_l=kernel_size-1,
                                weight_softmax=args.weight_softmax,
                                num_heads=args.decoder_attention_heads,
                                weight_dropout=args.weight_dropout)
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)

    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.decoder_normalize_before

    self.conv_layer_norm = LayerNorm(self.embed_dim)

    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim, args.decoder_attention_heads,
            dropout=args.attention_dropout, encoder_decoder_attention=True
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True
Example #30
Source File: lightconv.py From attn2d with MIT License | 4 votes |
def __init__(self, args, no_encoder_attn=False, kernel_size=0):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.conv_dim = args.decoder_conv_dim
    if args.decoder_glu:
        self.linear1 = Linear(self.embed_dim, 2*self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.decoder_conv_type == 'lightweight':
        self.conv = LightweightConv(self.conv_dim, kernel_size, padding_l=kernel_size-1,
                                    weight_softmax=args.weight_softmax,
                                    num_heads=args.decoder_attention_heads,
                                    weight_dropout=args.weight_dropout)
    elif args.decoder_conv_type == 'dynamic':
        self.conv = DynamicConv(self.conv_dim, kernel_size, padding_l=kernel_size-1,
                                weight_softmax=args.weight_softmax,
                                num_heads=args.decoder_attention_heads,
                                weight_dropout=args.weight_dropout)
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)

    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.decoder_normalize_before

    self.conv_layer_norm = LayerNorm(self.embed_dim)

    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim, args.decoder_attention_heads,
            dropout=args.attention_dropout, encoder_decoder_attention=True
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True