Python torch.nn.LayerNorm() Examples

The following are 30 code examples of torch.nn.LayerNorm(), collected from open-source projects. Each example lists the source file and project it was taken from, along with that project's license. You may also want to check out the other available functions and classes of the torch.nn module.
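Before looking at the project code, a minimal standalone sketch of what nn.LayerNorm computes may be useful: it normalizes each sample over its trailing normalized_shape dimensions (unlike BatchNorm, which normalizes across the batch) and, with elementwise_affine=True (the default), applies a learnable per-element scale and shift.

import torch
import torch.nn as nn

ln = nn.LayerNorm(1024)                   # eps=1e-5, elementwise_affine=True by default
x = torch.randn(8, 1024)                  # (batch, features)
y = ln(x)                                 # each row normalized to ~zero mean, unit variance

print(y.mean(dim=-1).abs().max())         # close to 0 at initialization (weight=1, bias=0)
print(ln.weight.shape, ln.bias.shape)     # torch.Size([1024]) torch.Size([1024])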
Example #1
Source File: scorenet.py    From ncsn with GNU General Public License v3.0
def __init__(self, config):
        super().__init__()
        self.config = config
        self.main = nn.Sequential(
            nn.Linear(10 * 10, 1024),
            nn.LayerNorm(1024),
            nn.ELU(),
            nn.Linear(1024, 1024),
            nn.LayerNorm(1024),
            nn.ELU(),
            nn.Linear(1024, 512),
            nn.LayerNorm(512),
            nn.ELU(),
            nn.Linear(512, 100),
            nn.LayerNorm(100)
        ) 
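The forward pass of this score network is not shown above; as a rough, self-contained sketch (ignoring config), a stack like this is applied to inputs flattened to 10 * 10 = 100 features:

import torch
import torch.nn as nn

mlp = nn.Sequential(
    nn.Linear(10 * 10, 1024), nn.LayerNorm(1024), nn.ELU(),
    nn.Linear(1024, 1024), nn.LayerNorm(1024), nn.ELU(),
    nn.Linear(1024, 512), nn.LayerNorm(512), nn.ELU(),
    nn.Linear(512, 100), nn.LayerNorm(100),
)

x = torch.randn(4, 1, 10, 10)           # a batch of 10x10 single-channel inputs
out = mlp(x.view(x.size(0), -1))        # flatten to (batch, 100) before the first Linear
print(out.shape)                        # torch.Size([4, 100])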
Example #2
Source File: modules.py    From dgl with Apache License 2.0
def __init__(self,
                 in_feats,
                 out_feats,
                 activation,
                 dropout,
                 bias=True,
                 use_pp=False,
                 use_lynorm=True):
        super(GraphSAGELayer, self).__init__()
        # The input feature size is doubled because the original features are
        # concatenated with the aggregated neighbor features.
        self.linear = nn.Linear(2 * in_feats, out_feats, bias=bias)
        self.activation = activation
        self.use_pp = use_pp
        if dropout:
            self.dropout = nn.Dropout(p=dropout)
        else:
            self.dropout = 0.
        if use_lynorm:
            self.lynorm = nn.LayerNorm(out_feats, elementwise_affine=True)
        else:
            self.lynorm = lambda x: x
        self.reset_parameters() 
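Two details in this layer are worth a note: elementwise_affine=True (also the default) gives the norm learnable weight and bias vectors of shape (out_feats,), and the lambda x: x fallback could equivalently be nn.Identity(), which has the advantage of being registered as a submodule. A small sketch:

import torch
import torch.nn as nn

out_feats = 16
ln_affine = nn.LayerNorm(out_feats, elementwise_affine=True)    # learnable weight and bias
ln_plain = nn.LayerNorm(out_feats, elementwise_affine=False)    # pure normalization, no parameters

print(sum(p.numel() for p in ln_affine.parameters()))   # 32
print(sum(p.numel() for p in ln_plain.parameters()))    # 0

lynorm = nn.Identity()                   # drop-in replacement for `lambda x: x`
x = torch.randn(2, out_feats)
assert torch.equal(lynorm(x), x)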
Example #3
Source File: model.py    From dgl with Apache License 2.0
def __init__(self, c, T, n, Lk, p, num_layers, control_str='TNTSTNTST'):
        super(STGCN_WAVE, self).__init__()
        self.control_str = control_str # model structure controller
        self.num_layers = len(control_str)
        self.layers = []
        cnt = 0
        diapower = 0
        for i in range(self.num_layers):
            i_layer = control_str[i]
            if i_layer == 'T': # Temporal Layer
                self.layers.append(TemporalConvLayer(c[cnt], c[cnt + 1], dia = 2**diapower))
                diapower += 1
                cnt += 1
            if i_layer == 'S': # Spatio Layer
                self.layers.append(SpatioConvLayer(c[cnt], Lk))
            if i_layer == 'N': # Norm Layer
                self.layers.append(nn.LayerNorm([n,c[cnt]]))
        self.output = OutputLayer(c[cnt], T + 1 - 2**(diapower), n)
        for layer in self.layers:
            layer = layer.cuda() 
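Here nn.LayerNorm receives a two-element normalized_shape, [n, c[cnt]], so each sample is normalized jointly over its last two dimensions; the (batch, channels, time, nodes) activations presumably have to be permuted so that (nodes, channels) come last before the norm is applied. Note also that self.layers is a plain Python list rather than an nn.ModuleList, so those layers are not registered on the parent module (the explicit .cuda() loop handles device placement only). A sketch of the permute-then-normalize pattern, with illustrative sizes:

import torch
import torch.nn as nn

batch, c, t, n = 2, 16, 12, 207         # illustrative sizes, not the model's real config
ln = nn.LayerNorm([n, c])               # normalizes jointly over the trailing (nodes, channels) dims

x = torch.randn(batch, c, t, n)         # (batch, channels, time, nodes)
y = ln(x.permute(0, 2, 3, 1))           # -> (batch, time, nodes, channels); last dims match [n, c]
y = y.permute(0, 3, 1, 2)               # back to (batch, channels, time, nodes)
print(y.shape)                          # torch.Size([2, 16, 12, 207])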
Example #4
Source File: Transformer.py    From ConvLab with MIT License
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super(AverageHeadAttention, self).__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

        self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout) 
Example #5
Source File: Transformer.py    From ConvLab with MIT License
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

        self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout) 
Example #6
Source File: attention.py    From meshed-memory-transformer with BSD 3-Clause "New" or "Revised" License
def __init__(self, d_model, d_k, d_v, h, dropout=.1, identity_map_reordering=False, can_be_stateful=False,
                 attention_module=None, attention_module_kwargs=None):
        super(MultiHeadAttention, self).__init__()
        self.identity_map_reordering = identity_map_reordering
        if attention_module is not None:
            if attention_module_kwargs is not None:
                self.attention = attention_module(d_model=d_model, d_k=d_k, d_v=d_v, h=h, **attention_module_kwargs)
            else:
                self.attention = attention_module(d_model=d_model, d_k=d_k, d_v=d_v, h=h)
        else:
            self.attention = ScaledDotProductAttention(d_model=d_model, d_k=d_k, d_v=d_v, h=h)
        self.dropout = nn.Dropout(p=dropout)
        self.layer_norm = nn.LayerNorm(d_model)

        self.can_be_stateful = can_be_stateful
        if self.can_be_stateful:
            self.register_state('running_keys', torch.zeros((0, d_model)))
            self.register_state('running_values', torch.zeros((0, d_model))) 
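This wrapper's identity_map_reordering flag chooses between post-norm (normalize the residual sum) and pre-norm (normalize the sublayer input and keep the residual path an identity). The exact forward is not reproduced above; a generic sketch of the two orderings, with a Linear standing in for the attention call:

import torch
import torch.nn as nn

d_model = 512
layer_norm = nn.LayerNorm(d_model)
dropout = nn.Dropout(p=0.1)
sublayer = nn.Linear(d_model, d_model)            # stand-in for the attention module

x = torch.randn(2, 7, d_model)

post = layer_norm(x + dropout(sublayer(x)))       # post-norm: normalize after the residual add
pre = x + dropout(sublayer(layer_norm(x)))        # pre-norm: normalize the input, residual untouched
print(post.shape, pre.shape)                      # both torch.Size([2, 7, 512])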
Example #7
Source File: encoder.py    From TVQAplus with MIT License
def __init__(self, n_conv, kernel_size=7, n_filters=128, dropout=0.1, num_heads=4):
        super(EncoderBlock, self).__init__()
        self.dropout = dropout
        self.n_conv = n_conv
        self.num_heads = num_heads

        self.position_encoding = PositionEncoding(n_filters=n_filters)

        self.layer_norm = nn.ModuleList([nn.LayerNorm(n_filters) for _ in range(n_conv)])
        self.final_layer_norm = nn.LayerNorm(n_filters)
        self.conv = nn.ModuleList([
            DepthwiseSeparableConv(in_ch=n_filters, out_ch=n_filters, k=kernel_size, relu=True)
            for _ in range(n_conv)])

        if self.num_heads != 0:
            self.multi_head_attn = MultiHeadedAttention(nh=num_heads, d_model=n_filters)
            self.attn_layer_norm = nn.LayerNorm(n_filters) 
Example #8
Source File: encoder.py    From pytorch_sac_ae with MIT License
def __init__(self, obs_shape, feature_dim, num_layers=2, num_filters=32):
        super().__init__()

        assert len(obs_shape) == 3

        self.feature_dim = feature_dim
        self.num_layers = num_layers

        self.convs = nn.ModuleList(
            [nn.Conv2d(obs_shape[0], num_filters, 3, stride=2)]
        )
        for i in range(num_layers - 1):
            self.convs.append(nn.Conv2d(num_filters, num_filters, 3, stride=1))

        out_dim = OUT_DIM[num_layers]
        self.fc = nn.Linear(num_filters * out_dim * out_dim, self.feature_dim)
        self.ln = nn.LayerNorm(self.feature_dim)

        self.outputs = dict() 
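The encoder's forward is not shown; the usual SAC-AE recipe flattens the final conv feature map, projects it with fc, applies ln, and squashes with tanh. The sketch below follows that recipe (the ReLU between convs and the tanh after the norm are assumptions here, and OUT_DIM in the original file appears to be a lookup table for the spatial size of the final feature map, which this sketch computes directly):

import torch
import torch.nn as nn

obs_shape, num_filters, feature_dim = (3, 84, 84), 32, 50    # illustrative pixel observations

convs = nn.ModuleList([nn.Conv2d(obs_shape[0], num_filters, 3, stride=2),
                       nn.Conv2d(num_filters, num_filters, 3, stride=1)])

obs = torch.randn(4, *obs_shape)
h = obs
for conv in convs:
    h = torch.relu(conv(h))             # assumed nonlinearity between conv layers
h = h.view(h.size(0), -1)               # flatten the conv feature map

fc = nn.Linear(h.size(1), feature_dim)
ln = nn.LayerNorm(feature_dim)          # normalize the latent per sample
z = torch.tanh(ln(fc(h)))               # LayerNorm before the squashing nonlinearity
print(z.shape)                          # torch.Size([4, 50])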
Example #9
Source File: meta.py    From ScenarioMeta with MIT License
def __init__(self, hidden_size, layer_norm=False, input_gate=True, forget_gate=True):
            nn.Module.__init__(self)
            self.hidden_size = hidden_size
            # gradient(2), param(2), loss
            self.lstm = nn.LSTMCell(input_size=5, hidden_size=hidden_size)
            if layer_norm:
                self.layer_norm = nn.LayerNorm(hidden_size)
            else:
                self.layer_norm = None
            self.input_gate = input_gate
            self.forget_gate = forget_gate
            if self.input_gate:
                self.lr_layer = nn.Linear(hidden_size, 1)
                self.lrs = []
            else:
                self.output_layer = nn.Linear(hidden_size, 1)
                self.dets = []
            if forget_gate:
                self.fg_layer = nn.Linear(hidden_size, 1)
                self.fgs = []
            self.h_0 = nn.Parameter(torch.randn((hidden_size,), requires_grad=True))
            self.c_0 = nn.Parameter(torch.randn((hidden_size,), requires_grad=True)) 
Example #10
Source File: transformer_blocks.py    From Character-Level-Language-Modeling-with-Deeper-Self-Attention-pytorch with MIT License
def __init__(self, input_size, inner_linear, inner_groups=1, layer_norm=True, weight_norm=False, dropout=0, batch_first=True):
        super(AverageNetwork, self).__init__()
        wn_func = wn if weight_norm else lambda x: x
        self.input_size = input_size
        self.time_step = 0
        self.batch_dim, self.time_dim = (0, 1) if batch_first else (1, 0)
        self.gates = nn.Sequential(
            wn_func(nn.Linear(2 * input_size, 2 * input_size)),
            nn.Sigmoid()
        )
        if layer_norm:
            self.lnorm = nn.LayerNorm(input_size)
        self.fc = nn.Sequential(wn_func(Linear(input_size, inner_linear, groups=inner_groups)),
                                nn.ReLU(inplace=True),
                                nn.Dropout(dropout),
                                wn_func(Linear(inner_linear, input_size, groups=inner_groups))) 
Example #11
Source File: transformer.py    From crosentgec with GNU General Public License v3.0
def __init__(self, args):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.self_attn = MultiheadAttention(
            self.embed_dim, args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.normalize_before = args.decoder_normalize_before
        self.encoder_attn = MultiheadAttention(
            self.embed_dim, args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
        self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for i in range(3)]) 
Example #12
Source File: absa_layer.py    From BERT-E2E-ABSA with Apache License 2.0
def __init__(self, input_size, hidden_size, bidirectional=True):
        """

        :param input_size:
        :param hidden_size:
        :param bidirectional:
        """
        super(LSTM, self).__init__()
        self.input_size = input_size
        if bidirectional:
            self.hidden_size = hidden_size // 2
        else:
            self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.LNx = nn.LayerNorm(4*self.hidden_size)
        self.LNh = nn.LayerNorm(4*self.hidden_size)
        self.LNc = nn.LayerNorm(self.hidden_size)
        self.Wx = nn.Linear(in_features=self.input_size, out_features=4*self.hidden_size, bias=True)
        self.Wh = nn.Linear(in_features=self.hidden_size, out_features=4*self.hidden_size, bias=True) 
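The matching forward pass is not shown above; a common layer-normalized LSTM cell update consistent with these shapes normalizes the input and recurrent pre-activations separately before summing them, and normalizes the cell state before the output gate. A self-contained sketch (the gate ordering is an assumption):

import torch
import torch.nn as nn

hidden_size, input_size, batch = 8, 5, 3
Wx, Wh = nn.Linear(input_size, 4 * hidden_size), nn.Linear(hidden_size, 4 * hidden_size)
LNx, LNh = nn.LayerNorm(4 * hidden_size), nn.LayerNorm(4 * hidden_size)
LNc = nn.LayerNorm(hidden_size)

x_t = torch.randn(batch, input_size)
h_prev = torch.zeros(batch, hidden_size)
c_prev = torch.zeros(batch, hidden_size)

gates = LNx(Wx(x_t)) + LNh(Wh(h_prev))                 # normalize input and recurrent parts separately
i, f, g, o = gates.chunk(4, dim=1)                     # assumed gate ordering
c_t = torch.sigmoid(f) * c_prev + torch.sigmoid(i) * torch.tanh(g)
h_t = torch.sigmoid(o) * torch.tanh(LNc(c_t))          # cell state normalized before the output gate
print(h_t.shape, c_t.shape)                            # torch.Size([3, 8]) torch.Size([3, 8])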
Example #13
Source File: utils.py    From cortex with BSD 3-Clause "New" or "Revised" License
def finish_layer_1d(models, name, dim_out,
                    dropout=False, layer_norm=False, batch_norm=False,
                    nonlinearity=None):
    if layer_norm and batch_norm:
        logger.warning('Ignoring layer_norm because batch_norm is True')

    if dropout:
        models.add_module(name + '_do', nn.Dropout(p=dropout))

    if layer_norm:
        models.add_module(name + '_ln', nn.LayerNorm(dim_out))
    elif batch_norm:
        models.add_module(name + '_bn', nn.BatchNorm1d(dim_out))

    if nonlinearity:
        nonlinearity = get_nonlinearity(nonlinearity)
        models.add_module(
            '{}_{}'.format(name, nonlinearity.__class__.__name__),
            nonlinearity) 
Example #14
Source File: utils.py    From cortex with BSD 3-Clause "New" or "Revised" License
def finish_layer_2d(models, name, dim_x, dim_y, dim_out,
                    dropout=False, layer_norm=False, batch_norm=False,
                    nonlinearity=None):
    if layer_norm and batch_norm:
        logger.warning('Ignoring layer_norm because batch_norm is True')

    if dropout:
        models.add_module(name + '_do', nn.Dropout2d(p=dropout))

    if layer_norm:
        models.add_module(name + '_ln', nn.LayerNorm((dim_out, dim_x, dim_y)))
    elif batch_norm:
        models.add_module(name + '_bn', nn.BatchNorm2d(dim_out))

    if nonlinearity:
        nonlinearity = get_nonlinearity(nonlinearity)
        models.add_module(
            '{}_{}'.format(name, nonlinearity.__class__.__name__),
            nonlinearity) 
Example #15
Source File: absa_layer.py    From BERT-E2E-ABSA with Apache License 2.0
def __init__(self, input_size, hidden_size, bidirectional=True):
        """

        :param input_size:
        :param hidden_size:
        :param bidirectional:
        """
        super(GRU, self).__init__()
        self.input_size = input_size
        if bidirectional:
            self.hidden_size = hidden_size // 2
        else:
            self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.Wxrz = nn.Linear(in_features=self.input_size, out_features=2*self.hidden_size, bias=True)
        self.Whrz = nn.Linear(in_features=self.hidden_size, out_features=2*self.hidden_size, bias=True)
        self.Wxn = nn.Linear(in_features=self.input_size, out_features=self.hidden_size, bias=True)
        self.Whn = nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size, bias=True)
        self.LNx1 = nn.LayerNorm(2*self.hidden_size)
        self.LNh1 = nn.LayerNorm(2*self.hidden_size)
        self.LNx2 = nn.LayerNorm(self.hidden_size)
        self.LNh2 = nn.LayerNorm(self.hidden_size) 
Example #16
Source File: models.py    From cerl with Apache License 2.0
def __init__(self, state_dim, action_dim, wwid):
        super(Actor, self).__init__()

        self.wwid = torch.Tensor([wwid])
        l1 = 400; l2 = 300

        # Construct Hidden Layer 1
        self.f1 = nn.Linear(state_dim, l1)
        self.ln1 = nn.LayerNorm(l1)

        #Hidden Layer 2
        self.f2 = nn.Linear(l1, l2)
        self.ln2 = nn.LayerNorm(l2)

        #Out
        self.w_out = nn.Linear(l2, action_dim) 
Example #17
Source File: module.py    From Transformer-TTS with MIT License
def __init__(self, num_hidden, h=4):
        """
        :param num_hidden: dimension of hidden
        :param h: num of heads 
        """
        super(Attention, self).__init__()

        self.num_hidden = num_hidden
        self.num_hidden_per_attn = num_hidden // h
        self.h = h

        self.key = Linear(num_hidden, num_hidden, bias=False)
        self.value = Linear(num_hidden, num_hidden, bias=False)
        self.query = Linear(num_hidden, num_hidden, bias=False)

        self.multihead = MultiheadAttention(self.num_hidden_per_attn)

        self.residual_dropout = nn.Dropout(p=0.1)

        self.final_linear = Linear(num_hidden * 2, num_hidden)

        self.layer_norm_1 = nn.LayerNorm(num_hidden) 
Example #18
Source File: transformer.py    From ITDD with MIT License
def __init__(self, num_layers, d_model, heads, d_ff, attn_type,
                 copy_attn, self_attn_type, dropout, embeddings):
        super(TransformerDecoder, self).__init__()

        # Basic attributes.
        self.decoder_type = 'transformer'
        self.num_layers = num_layers
        self.embeddings = embeddings
        self.self_attn_type = self_attn_type

        # Decoder State
        self.state = {}

        # Build TransformerDecoder.
        self.transformer_layers = nn.ModuleList(
            [TransformerDecoderLayer(d_model, heads, d_ff, dropout,
             self_attn_type=self_attn_type)
             for _ in range(num_layers)])

        # TransformerDecoder has its own attention mechanism.
        # Set up a separated copy attention layer, if needed.
        self._copy = False
        if copy_attn:
            self.copy_attn = onmt.modules.GlobalAttention(
                d_model, attn_type=attn_type)
            self._copy = True
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 
Example #19
Source File: transformer.py    From ITDD with MIT License
def __init__(self, num_layers, d_model, heads, d_ff,
                 dropout, embeddings):
        super(TransformerEncoder, self).__init__()

        self.num_layers = num_layers
        self.embeddings = embeddings
        self.transformer = nn.ModuleList(
            [TransformerEncoderLayer(d_model, heads, d_ff, dropout)
             for _ in range(num_layers)])
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) 
Example #20
Source File: transformer.py    From ITDD with MIT License
def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout) 
Example #21
Source File: position_ffn.py    From ITDD with MIT License
def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout_1 = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.dropout_2 = nn.Dropout(dropout) 
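Note the explicit eps=1e-6, smaller than the 1e-5 default. The forward of this OpenNMT-style feed-forward block is not reproduced above; a sketch of a pre-norm position-wise FFN built from the same submodules (the exact residual placement in the source may differ):

import torch
import torch.nn as nn

d_model, d_ff, p = 512, 2048, 0.1
w_1, w_2 = nn.Linear(d_model, d_ff), nn.Linear(d_ff, d_model)
layer_norm = nn.LayerNorm(d_model, eps=1e-6)
dropout_1, dropout_2 = nn.Dropout(p), nn.Dropout(p)

x = torch.randn(2, 9, d_model)
inter = dropout_1(torch.relu(w_1(layer_norm(x))))   # pre-norm, then expand to d_ff
out = dropout_2(w_2(inter)) + x                     # project back and add the residual
print(out.shape)                                    # torch.Size([2, 9, 512])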
Example #22
Source File: utils.py    From meshed-memory-transformer with BSD 3-Clause "New" or "Revised" License
def __init__(self, d_model=512, d_ff=2048, dropout=.1, identity_map_reordering=False):
        super(PositionWiseFeedForward, self).__init__()
        self.identity_map_reordering = identity_map_reordering
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(p=dropout)
        self.dropout_2 = nn.Dropout(p=dropout)
        self.layer_norm = nn.LayerNorm(d_model) 
Example #23
Source File: layer_norm.py    From fairseq with MIT License
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
    if not export and torch.cuda.is_available() and has_fused_layernorm:
        return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) 
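This factory prefers apex's fused CUDA kernel when it is available, otherwise falling back to the stock torch.nn.LayerNorm. The has_fused_layernorm flag is presumably set by a guarded import elsewhere in the file, roughly along these lines (a sketch, not necessarily fairseq's exact code):

import torch

try:
    from apex.normalization import FusedLayerNorm   # optional dependency
    has_fused_layernorm = True
except ImportError:
    has_fused_layernorm = False


def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
    # Use the fused kernel only when not exporting, a GPU is visible, and apex is installed.
    if not export and torch.cuda.is_available() and has_fused_layernorm:
        return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)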
Example #24
Source File: dummy_model.py    From fairseq with MIT License
def __init__(self, num_embed=50000, embed_dim=1024, num_layers=24):
        super().__init__(Dictionary())
        self.embed = nn.Embedding(
            num_embeddings=num_embed, embedding_dim=embed_dim, padding_idx=0
        )
        self.layers_a = nn.ModuleList([
            nn.Sequential(
                nn.LayerNorm(embed_dim),
                nn.Linear(embed_dim, 3*embed_dim),  # q, k, v input projection
                nn.Linear(3*embed_dim, embed_dim),  # skip self-attention
                nn.Linear(embed_dim, embed_dim),    # output projection
                nn.Dropout(),
            )
            for i in range(num_layers)
        ])
        self.layers_b = nn.ModuleList([
            nn.Sequential(
                nn.LayerNorm(embed_dim),
                nn.Linear(embed_dim, 4*embed_dim),  # FFN
                nn.ReLU(),
                nn.Linear(4*embed_dim, embed_dim),  # FFN
                nn.Dropout(0.1),
            )
            for i in range(num_layers)
        ])
        self.out_proj = nn.Linear(embed_dim, num_embed) 
Example #25
Source File: vggtransformer.py    From fairseq with MIT License
def LayerNorm(embedding_dim):
    m = nn.LayerNorm(embedding_dim)
    return m


# seq2seq models 
Example #26
Source File: encoders.py    From meshed-memory-transformer with BSD 3-Clause "New" or "Revised" License
def __init__(self, N, padding_idx, d_in=2048, **kwargs):
        super(MemoryAugmentedEncoder, self).__init__(N, padding_idx, **kwargs)
        self.fc = nn.Linear(d_in, self.d_model)
        self.dropout = nn.Dropout(p=self.dropout)
        self.layer_norm = nn.LayerNorm(self.d_model) 
Example #27
Source File: module.py    From Transformer-TTS with MIT License
def __init__(self, num_hidden):
        """
        :param num_hidden: dimension of hidden 
        """
        super(FFN, self).__init__()
        self.w_1 = Conv(num_hidden, num_hidden * 4, kernel_size=1, w_init='relu')
        self.w_2 = Conv(num_hidden * 4, num_hidden, kernel_size=1)
        self.dropout = nn.Dropout(p=0.1)
        self.layer_norm = nn.LayerNorm(num_hidden) 
Example #28
Source File: layers.py    From dgl with Apache License 2.0
def __init__(self, size, dropout):
        super(SubLayerWrapper, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout) 
Example #29
Source File: scorenet.py    From ncsn with GNU General Public License v3.0
def __init__(self, config):
        super().__init__()
        self.config = config
        nef = config.model.nef * 4
        self.u_net = nn.Sequential(
            # input is (nc) x 10 x 10
            nn.Conv2d(config.data.channels, nef, 4, stride=2, padding=1),
            # nn.Softplus(),
            nn.GroupNorm(4, nef),
            nn.ELU(),
            # state size. (nef) x 6 x 6
            nn.Conv2d(nef, nef * 2, 3, stride=1, padding=1),
            nn.GroupNorm(4, nef * 2),
            # nn.Softplus(),
            nn.ELU(),
            # state size. (nef*2) x 6 x 6
            nn.ConvTranspose2d(nef * 2, nef, 3, stride=1, padding=1),
            nn.GroupNorm(4, nef),
            # nn.Softplus(),
            nn.ELU(),
            # state size. (nef*2) x 6 x 6
            nn.ConvTranspose2d(nef, config.data.channels, 4, stride=2, padding=1),
            # nn.Softplus(),
            nn.ELU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(config.data.channels * 10 ** 2, 256),
            nn.LayerNorm(256),
            nn.ELU(),
            nn.Linear(256, config.data.channels * 10 ** 2)
        ) 
Example #30
Source File: ktransformer.py    From ITDD with MIT License
def __init__(self, d_model, heads, d_ff, dropout):
        super(HTransformerEncoderLayer, self).__init__()

        self.self_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.knowledge_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)