Python torch.nn.MultiheadAttention() Examples

The following are 15 code examples of torch.nn.MultiheadAttention(), collected from open-source projects. The source file, project, and license are listed above each example. You may also want to check out all available functions/classes of the module torch.nn.
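As a quick orientation before the project examples, here is a minimal, self-contained sketch of constructing and calling nn.MultiheadAttention. By default the module expects sequence-first inputs of shape (sequence length, batch size, embed_dim) and returns the attended output together with the head-averaged attention weights.

import torch
import torch.nn as nn

embed_dim, num_heads = 512, 8
mha = nn.MultiheadAttention(embed_dim, num_heads, dropout=0.1)

# Default layout is (sequence length, batch size, embed_dim).
query = torch.rand(10, 32, embed_dim)   # target sequence of length 10
key   = torch.rand(20, 32, embed_dim)   # source sequence of length 20
value = torch.rand(20, 32, embed_dim)

# attn_output: (10, 32, 512); attn_weights: (32, 10, 20), averaged over heads.
attn_output, attn_weights = mha(query, key, value)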
Example #1
Source File: MReCoSa_RA.py    From MultiTurnDialogZoo with MIT License
def __init__(self, embed_size, hidden_size, output_size, n_layer=2, dropout=0.5, pretrained=None):
        super(Decoder, self).__init__()
        self.embed_size, self.hidden_size = embed_size, hidden_size
        self.output_size = output_size
        self.n_layer = n_layer

        self.embed = nn.Embedding(output_size, embed_size)
        self.rnn = nn.GRU(hidden_size + embed_size, hidden_size, 
                          num_layers=n_layer, dropout=(0 if n_layer == 1 else dropout))
        self.out = nn.Linear(hidden_size, output_size)
        self.pos_emb = PositionEmbedding(embed_size, dropout=dropout)
        self.self_attention_context1 = nn.MultiheadAttention(embed_size, 8)
        self.layer_norm1 = nn.LayerNorm(embed_size)
        self.dropout1 = nn.Dropout(p=dropout)
        self.self_attention_context2 = nn.MultiheadAttention(embed_size, 8)
        self.layer_norm2 = nn.LayerNorm(embed_size)
        self.dropout2 = nn.Dropout(p=dropout)
        # self.self_attention_context3 = nn.MultiheadAttention(embed_size, 8)
        # self.layer_norm3 = nn.LayerNorm(embed_size)
        # self.dropout3 = nn.Dropout(p=dropout)
        
        self.self_attention = nn.MultiheadAttention(hidden_size, 8)
        self.word_level_attn = Attention(embed_size)
        self.init_weight() 
Example #2
Source File: MReCoSa.py    From MultiTurnDialogZoo with MIT License
def __init__(self, input_size, embed_size, output_size, utter_hidden, 
                 decoder_hidden, teach_force=0.5, pad=1, sos=1, dropout=0.5, 
                 utter_n_layer=1, pretrained=None):
        super(MReCoSa, self).__init__()
        self.encoder = Encoder(input_size, embed_size, utter_hidden, n_layers=utter_n_layer,
                               dropout=dropout, pretrained=pretrained)
        self.decoder = Decoder(embed_size, decoder_hidden, output_size, n_layer=utter_n_layer,
                               dropout=dropout, pretrained=pretrained)
        self.teach_force = teach_force
        self.pad, self.sos = pad, sos
        self.output_size = output_size
        self.pos_emb = PositionEmbedding(embed_size, dropout=dropout)
        self.self_attention_context1 = nn.MultiheadAttention(embed_size, 8)
        self.layer_norm1 = nn.LayerNorm(embed_size)
        self.self_attention_context2 = nn.MultiheadAttention(embed_size, 8)
        self.layer_norm2 = nn.LayerNorm(embed_size)
        self.self_attention_context3 = nn.MultiheadAttention(embed_size, 8)
        self.layer_norm3 = nn.LayerNorm(embed_size) 
Example #3
Source File: transformer.py    From flambe with MIT License
def __init__(self,
                 d_model: int,
                 nhead: int,
                 dim_feedforward: int = 2048,
                 dropout: float = 0.1) -> None:
        """Initialize a TransformerEncoderLayer.

        Parameters
        ----------
        d_model : int
            The number of expected features in the input.
        nhead : int
            The number of heads in the multi-head attention model.
        dim_feedforward : int, optional
            The dimension of the feedforward network (default=2048).
        dropout : float, optional
            The dropout value (default=0.1).

        """
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout) 
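The snippet above only shows the constructor; the layer's forward pass is not part of this excerpt. As a rough, hedged sketch (the exact ordering in flambe may differ), the modules created here are typically combined in a post-norm arrangement like this:

import torch

def encoder_layer_forward_sketch(layer, src, src_mask=None, src_key_padding_mask=None):
    # Self-attention block with residual connection and layer norm.
    attn_out, _ = layer.self_attn(src, src, src,
                                  attn_mask=src_mask,
                                  key_padding_mask=src_key_padding_mask)
    src = layer.norm1(src + layer.dropout1(attn_out))

    # Position-wise feedforward block with residual connection and layer norm.
    ff_out = layer.linear2(layer.dropout(torch.relu(layer.linear1(src))))
    src = layer.norm2(src + layer.dropout2(ff_out))
    return src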
Example #4
Source File: transformer.py    From flambe with MIT License
def __init__(self,
                 d_model: int,
                 nhead: int,
                 dim_feedforward: int = 2048,
                 dropout: float = 0.1) -> None:
        """Initialize a TransformerDecoder.

        Parameters
        ----------
        d_model : int
            The number of expected features in the input.
        nhead : int
            The number of heads in the multi-head attention model.
        dim_feedforward : int, optional
            The dimension of the feedforward network (default=2048).
        dropout : float, optional
            The dropout value (default=0.1).

        """
        super().__init__()

        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout) 
Example #5
Source File: Transformer_OpenAI.py    From nlp-experiments-in-pytorch with MIT License
def __init__(self, embed_dim, num_heads, keep_prob_attention, keep_prob_residual, keep_prob_mlp, n_ctx=512,
                 scale=False, use_builtin_mha=False):
        super().__init__()
        if use_builtin_mha:
            self.attention = nn.MultiheadAttention(embed_dim=embed_dim,
                                                   num_heads=num_heads,
                                                   dropout=keep_prob_attention)
        else:
            self.attention = ModifiedMultiHeadedAttention(num_state=embed_dim,
                                                          n_ctx=n_ctx,
                                                          num_heads=num_heads,
                                                          keep_prob_attention=keep_prob_attention,
                                                          keep_prob_residual=keep_prob_residual,
                                                          scale=scale)
        self.layer_norm1 = LayerNorm(embed_dim)
        self.mlp = MultiLayerPerceptron(4 * embed_dim, embed_dim, keep_prob_mlp)
        self.layer_norm2 = LayerNorm(embed_dim) 
Example #6
Source File: absa_layer.py    From BERT-E2E-ABSA with Apache License 2.0
def __init__(self, d_model, nhead, dropout=0.1):
        super(SAN, self).__init__()
        self.d_model = d_model
        self.nhead = nhead
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)
        self.norm = nn.LayerNorm(d_model) 
Example #7
Source File: textual_heads.py    From virtex with MIT License
def _init_weights(module):
        r"""Initialize weights like BERT - N(0.0, 0.02), bias = 0."""

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
        elif isinstance(module, nn.MultiheadAttention):
            module.in_proj_weight.data.normal_(mean=0.0, std=0.02)
            module.out_proj.weight.data.normal_(mean=0.0, std=0.02)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_() 
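An initializer like this is usually registered on the whole model via nn.Module.apply, which visits every submodule. A hedged usage sketch (the model below is illustrative and not from virtex, where _init_weights is a static method of the textual head class):

import torch.nn as nn

# Illustrative model containing Linear, MultiheadAttention, and Embedding submodules.
model = nn.ModuleDict({
    "embed": nn.Embedding(100, 64, padding_idx=0),
    "attn": nn.MultiheadAttention(64, 8),
    "proj": nn.Linear(64, 64),
})
model.apply(_init_weights)   # calls _init_weights on every submodule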
Example #8
Source File: operations.py    From torecsys with MIT License
def dummy_attention(key  : torch.Tensor, 
                    query: torch.Tensor, 
                    value: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    r"""function for dummy in jit-compile features of torch, which have the same inputs and 
    outputs to nn.MultiheadAttention().__call__()
    
    Args:
        key (T): inputs to be passed as output
        query (T): dummy inputs
        value (T): dummy inputs
    
    Returns:
        Tuple[T, T]: values = (key, dummy outputs = torch.Tensor([]))
    """
    return key, torch.Tensor([]) 
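A hedged sketch of how such a dummy can be swapped in for the real module so the calling code keeps a single code path; the use_attn flag and dimensions below are illustrative, not from torecsys:

import torch
import torch.nn as nn

use_attn = False   # e.g. a model flag that disables attention
attn_fn = nn.MultiheadAttention(64, 8) if use_attn else dummy_attention

seq = torch.rand(10, 4, 64)                 # (sequence length, batch size, embed_dim)
outputs, weights = attn_fn(seq, seq, seq)   # same call shape either way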
Example #9
Source File: operations.py    From torecsys with MIT License
def show_attention(attentions : np.ndarray, 
                   xaxis      : Union[list, str] = None, 
                   yaxis      : Union[list, str] = None, 
                   savedir    : str = None):
    r"""Show attention of MultiheadAttention in a mpl heatmap
    
    Args:
        attentions (np.ndarray), shape = (sequence length, sequence length), dtype = np.float32: Attentions Weights of output of nn.MultiheadAttention
        xaxis (str, optional): string or list of xaxis. Defaults to None.
        yaxis (str, optional): string or list of yaxis. Defaults to None.
        savedir (str, optional): string of directory to save the attention png. Defaults to None.
    """
    # set up figure with colorbar
    fig = plt.figure()
    ax  = fig.add_subplot(111)
    cax = ax.matshow(attentions)
    fig.colorbar(cax)

    # set up axes
    if xaxis is not None:
        if isinstance(xaxis, str):
            xaxis = [""] + xaxis.split(",")
        elif isinstance(xaxis, list):
            xaxis = [""] + xaxis
        ax.set_xticklabels(xaxis, rotation=90)
    
    if yaxis is not None:
        if isinstance(yaxis, str):
            yaxis = [""] + yaxis.split(",")
        elif isinstance(yaxis, list):
            yaxis = [""] + yaxis
        ax.set_yticklabels(yaxis)
    
    # show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    
    if savedir is None:
        plt.show()
    else:
        plt.savefig(savedir) 
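A hedged usage sketch, assuming the weights come straight from an nn.MultiheadAttention forward pass (by default they are averaged over heads and shaped (batch size, target length, source length)):

import torch
import torch.nn as nn

mha = nn.MultiheadAttention(embed_dim=32, num_heads=4)
tokens = torch.rand(5, 1, 32)                   # (L, N, E) with batch size 1
_, attn_weights = mha(tokens, tokens, tokens)   # attn_weights: (1, 5, 5)

labels = "the,cat,sat,down,today"               # illustrative token labels
show_attention(attn_weights[0].detach().numpy(), xaxis=labels, yaxis=labels)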
Example #10
Source File: model.py    From fine-grained-sentiment with MIT License
def __init__(self, embed_dim, hidden_dim, num_embeddings, num_max_positions, num_heads, num_layers, dropout, causal):
        super().__init__()
        self.causal = causal
        self.tokens_embeddings = nn.Embedding(num_embeddings, embed_dim)
        self.position_embeddings = nn.Embedding(num_max_positions, embed_dim)
        self.dropout = nn.Dropout(dropout)

        self.attentions, self.feed_forwards = nn.ModuleList(), nn.ModuleList()
        self.layer_norms_1, self.layer_norms_2 = nn.ModuleList(), nn.ModuleList()
        for _ in range(num_layers):
            self.attentions.append(nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout))
            self.feed_forwards.append(nn.Sequential(nn.Linear(embed_dim, hidden_dim),
                                                    nn.ReLU(),
                                                    nn.Linear(hidden_dim, embed_dim)))
            self.layer_norms_1.append(nn.LayerNorm(embed_dim, eps=1e-12))
            self.layer_norms_2.append(nn.LayerNorm(embed_dim, eps=1e-12)) 
Example #11
Source File: layers.py    From MultiTurnDialogZoo with MIT License
def __init__(self, hidden_size, nhead=8, dropout=0.3):
        super(Multi_head_attention_trs, self).__init__()
        self.nhead = nhead
        self.hidden_size = hidden_size
        
        if hidden_size % nhead != 0:
            raise Exception(f'hidden_size must be divisible by nhead, but got {hidden_size}/{nhead}.')
        
        self.multi_head_attention = nn.MultiheadAttention(hidden_size, nhead)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.final_attn = Attention(hidden_size) 
Example #12
Source File: MReCoSa.py    From MultiTurnDialogZoo with MIT License
def __init__(self, embed_size, hidden_size, output_size, n_layer=2, dropout=0.5, pretrained=None):
        super(Decoder, self).__init__()
        self.embed_size, self.hidden_size = embed_size, hidden_size
        self.output_size = output_size
        self.n_layer = n_layer

        self.embed = nn.Embedding(output_size, embed_size)
        self.rnn = nn.GRU(hidden_size + embed_size, hidden_size, 
                          num_layers=n_layer, dropout=(0 if n_layer == 1 else dropout))
        self.out = nn.Linear(hidden_size, output_size)

        self.init_weight()
        self.self_attention = nn.MultiheadAttention(hidden_size, 8) 
Example #13
Source File: transformer_sru.py    From flambe with MIT License
def __init__(self,
                 d_model: int,
                 nhead: int,
                 dim_feedforward: int = 2048,
                 dropout: float = 0.1,
                 sru_dropout: Optional[float] = None,
                 bidirectional: bool = False,
                 **kwargs: Dict[str, Any]) -> None:
        """Initialize a TransformerSRUEncoderLayer.

        Parameters
        ----------
        d_model : int
            The number of expected features in the input.
        nhead : int
            The number of heads in the multi-head attention model.
        dim_feedforward : int, optional
            The dimension of the feedforward network (default=2048).
        dropout : float, optional
            The dropout value (default=0.1).
        sru_dropout: float, optional
            Dropout for the SRU cell. If not given, uses the same
            dropout value as the rest of the transformer.
        bidirectional: bool
            Whether the SRU module should be bidirectional.
            Default ``False``.

        Extra keyword arguments are passed to the SRUCell.

        """
        super().__init__()

        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.sru = SRUCell(d_model,
                           dim_feedforward,
                           dropout,
                           sru_dropout or dropout,
                           bidirectional=bidirectional,
                           has_skip_term=False, **kwargs)

        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout) 
Example #14
Source File: transformer_sru.py    From flambe with MIT License
def __init__(self,
                 d_model: int,
                 nhead: int,
                 dim_feedforward: int = 2048,
                 dropout: float = 0.1,
                 sru_dropout: Optional[float] = None,
                 **kwargs: Dict[str, Any]) -> None:
        """Initialize a TransformerDecoder.

        Parameters
        ----------
        d_model : int
            The number of expected features in the input.
        nhead : int
            The number of heads in the multi-head attention model.
        dim_feedforward : int, optional
            The dimension of the feedforward network (default=2048).
        dropout : float, optional
            The dropout value (default=0.1).
        sru_dropout: float, optional
            Dropout for the SRU cell. If not given, uses the same
            dropout value as the rest of the transformer.

        Extra keyword arguments are passed to the SRUCell.

        """
        super().__init__()

        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.sru = SRUCell(d_model,
                           dim_feedforward,
                           dropout,
                           sru_dropout or dropout,
                           bidirectional=False,
                           has_skip_term=False, **kwargs)

        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout) 
Example #15
Source File: dp_multihead_attention.py    From pytorch-dp with Apache License 2.0
def load_state_dict(self, state_dict):
        """ Loads module from previously saved state.
        Supports loading from both DPMultiheadAttention
        and nn.MultiheadAttention modules
        """
        if "in_proj_weight" in state_dict:
            qweight, kweight, vweight = state_dict["in_proj_weight"].chunk(3, dim=0)

            state_dict["qlinear.weight"] = qweight
            state_dict["klinear.weight"] = kweight
            state_dict["vlinear.weight"] = vweight
            del state_dict["in_proj_weight"]

        if "in_proj_bias" in state_dict:
            qbias, kbias, vbias = state_dict["in_proj_bias"].chunk(3, dim=0)

            state_dict["qlinear.bias"] = qbias
            state_dict["klinear.bias"] = kbias
            state_dict["vlinear.bias"] = vbias
            del state_dict["in_proj_bias"]

        if "bias_k" in state_dict:
            state_dict["seq_bias_k.bias"] = state_dict["bias_k"].squeeze()
            del state_dict["bias_k"]

        if "bias_v" in state_dict:
            state_dict["seq_bias_v.bias"] = state_dict["bias_v"].squeeze()
            del state_dict["bias_v"]

        if "q_proj_weight" in state_dict:
            state_dict["qlinear.weight"] = state_dict["q_proj_weight"]
            del state_dict["q_proj_weight"]

        if "k_proj_weight" in state_dict:
            state_dict["klinear.weight"] = state_dict["k_proj_weight"]
            del state_dict["k_proj_weight"]

        if "v_proj_weight" in state_dict:
            state_dict["vlinear.weight"] = state_dict["v_proj_weight"]
            del state_dict["v_proj_weight"]

        super(DPMultiheadAttention, self).load_state_dict(state_dict)
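A hedged usage sketch of porting weights from a standard attention module into the DP version. It assumes DPMultiheadAttention takes the same (embed_dim, num_heads, ...) constructor arguments as nn.MultiheadAttention, and the import path is illustrative and may differ between pytorch-dp and opacus versions:

import torch.nn as nn
from torchdp.layers import DPMultiheadAttention  # illustrative import path; adjust to your install

std_attn = nn.MultiheadAttention(embed_dim=64, num_heads=8)
dp_attn = DPMultiheadAttention(embed_dim=64, num_heads=8)

# load_state_dict above rewrites the fused in_proj_* entries into per-projection
# qlinear/klinear/vlinear entries before delegating to the parent implementation.
dp_attn.load_state_dict(std_attn.state_dict())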