Python torch.nn.MultiheadAttention() Examples
The following are 15 code examples of torch.nn.MultiheadAttention(). You can go to the original project or source file by following the link above each example, or check out the other available functions and classes of the torch.nn module.
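As a quick orientation before the examples, here is a minimal, self-contained sketch of the module's call signature. It is not taken from any of the projects below and assumes the default tensor layout of (sequence length, batch, embedding dimension) used by the PyTorch versions these examples target.

import torch
import torch.nn as nn

embed_dim, num_heads = 256, 8
mha = nn.MultiheadAttention(embed_dim, num_heads, dropout=0.1)

query = torch.randn(10, 32, embed_dim)   # (target_len, batch, embed_dim)
key = torch.randn(20, 32, embed_dim)     # (source_len, batch, embed_dim)
value = torch.randn(20, 32, embed_dim)   # same shape as key

# Returns the attended output and the attention weights averaged over heads.
attn_output, attn_weights = mha(query, key, value)
print(attn_output.shape)    # torch.Size([10, 32, 256])
print(attn_weights.shape)   # torch.Size([32, 10, 20])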
Example #1
Source File: MReCoSa_RA.py From MultiTurnDialogZoo with MIT License | 9 votes |
def __init__(self, embed_size, hidden_size, output_size,
             n_layer=2, dropout=0.5, pretrained=None):
    super(Decoder, self).__init__()
    self.embed_size, self.hidden_size = embed_size, hidden_size
    self.output_size = output_size
    self.n_layer = n_layer

    self.embed = nn.Embedding(output_size, embed_size)
    self.rnn = nn.GRU(hidden_size + embed_size, hidden_size,
                      num_layers=n_layer,
                      dropout=(0 if n_layer == 1 else dropout))
    self.out = nn.Linear(hidden_size, output_size)

    self.pos_emb = PositionEmbedding(embed_size, dropout=dropout)

    self.self_attention_context1 = nn.MultiheadAttention(embed_size, 8)
    self.layer_norm1 = nn.LayerNorm(embed_size)
    self.droput1 = nn.Dropout(p=dropout)

    self.self_attention_context2 = nn.MultiheadAttention(embed_size, 8)
    self.layer_norm2 = nn.LayerNorm(embed_size)
    self.droput2 = nn.Dropout(p=dropout)

    # self.self_attention_context3 = nn.MultiheadAttention(embed_size, 8)
    # self.layer_norm3 = nn.LayerNorm(embed_size)
    # self.droput3 = nn.Dropout(p=dropout)

    self.self_attention = nn.MultiheadAttention(hidden_size, 8)

    self.word_level_attn = Attention(embed_size)
    self.init_weight()
Example #2
Source File: MReCoSa.py From MultiTurnDialogZoo with MIT License | 7 votes |
def __init__(self, input_size, embed_size, output_size, utter_hidden,
             decoder_hidden, teach_force=0.5, pad=1, sos=1, dropout=0.5,
             utter_n_layer=1, pretrained=None):
    super(MReCoSa, self).__init__()
    self.encoder = Encoder(input_size, embed_size, utter_hidden,
                           n_layers=utter_n_layer, dropout=dropout,
                           pretrained=pretrained)
    self.decoder = Decoder(embed_size, decoder_hidden, output_size,
                           n_layer=utter_n_layer, dropout=dropout,
                           pretrained=pretrained)
    self.teach_force = teach_force
    self.pad, self.sos = pad, sos
    self.output_size = output_size
    self.pos_emb = PositionEmbedding(embed_size, dropout=dropout)

    self.self_attention_context1 = nn.MultiheadAttention(embed_size, 8)
    self.layer_norm1 = nn.LayerNorm(embed_size)
    self.self_attention_context2 = nn.MultiheadAttention(embed_size, 8)
    self.layer_norm2 = nn.LayerNorm(embed_size)
    self.self_attention_context3 = nn.MultiheadAttention(embed_size, 8)
    self.layer_norm3 = nn.LayerNorm(embed_size)
Example #3
Source File: transformer.py From flambe with MIT License | 6 votes |
def __init__(self, d_model: int, nhead: int,
             dim_feedforward: int = 2048, dropout: float = 0.1) -> None:
    """Initialize a TransformerEncoderLayer.

    Parameters
    ----------
    d_model : int
        The number of expected features in the input.
    nhead : int
        The number of heads in the multiheadattention models.
    dim_feedforward : int, optional
        The dimension of the feedforward network (default=2048).
    dropout : float, optional
        The dropout value (default=0.1).

    """
    super().__init__()
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    self.dropout = nn.Dropout(dropout)
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
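The snippet above only shows the constructor. A minimal sketch of how these sub-modules are typically wired together in the forward pass, assuming the usual post-norm residual ordering (an illustration, not flambe's actual forward implementation):

def forward(self, src, src_mask=None, src_key_padding_mask=None):
    # Self-attention block with residual connection and layer norm.
    attn_out, _ = self.self_attn(src, src, src, attn_mask=src_mask,
                                 key_padding_mask=src_key_padding_mask)
    src = self.norm1(src + self.dropout1(attn_out))
    # Position-wise feed-forward block with residual connection and layer norm.
    ff_out = self.linear2(self.dropout(torch.relu(self.linear1(src))))
    return self.norm2(src + self.dropout2(ff_out))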
Example #4
Source File: transformer.py From flambe with MIT License | 6 votes |
def __init__(self, d_model: int, nhead: int,
             dim_feedforward: int = 2048, dropout: float = 0.1) -> None:
    """Initialize a TransformerDecoderLayer.

    Parameters
    ----------
    d_model : int
        The number of expected features in the input.
    nhead : int
        The number of heads in the multiheadattention models.
    dim_feedforward : int, optional
        The dimension of the feedforward network (default=2048).
    dropout : float, optional
        The dropout value (default=0.1).

    """
    super().__init__()
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    self.dropout = nn.Dropout(dropout)
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.dropout3 = nn.Dropout(dropout)
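A decoder layer holds two attention modules: self_attn for masked self-attention over the target and multihead_attn for cross-attention over the encoder output. A hedged sketch of how they are usually combined (illustrative only, not flambe's exact code):

def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
    # Masked self-attention over the target sequence.
    sa_out, _ = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask)
    tgt = self.norm1(tgt + self.dropout1(sa_out))
    # Cross-attention: queries come from the decoder, keys/values from the encoder memory.
    ca_out, _ = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask)
    tgt = self.norm2(tgt + self.dropout2(ca_out))
    # Position-wise feed-forward block.
    ff_out = self.linear2(self.dropout(torch.relu(self.linear1(tgt))))
    return self.norm3(tgt + self.dropout3(ff_out))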
Example #5
Source File: Transformer_OpenAI.py From nlp-experiments-in-pytorch with MIT License | 6 votes |
def __init__(self, embed_dim, num_heads, keep_prob_attention, keep_prob_residual,
             keep_prob_mlp, n_ctx=512, scale=False, use_builtin_mha=False):
    if use_builtin_mha:
        self.attention = nn.MultiheadAttention(embed_dim=embed_dim,
                                               num_heads=num_heads,
                                               dropout=keep_prob_attention)
    else:
        self.attention = ModifiedMultiHeadedAttention(num_state=embed_dim,
                                                      n_ctx=n_ctx,
                                                      num_heads=num_heads,
                                                      keep_prob_attention=keep_prob_attention,
                                                      keep_prob_residual=keep_prob_residual,
                                                      scale=scale)
    self.layer_norm1 = LayerNorm(embed_dim)
    self.mlp = MultiLayerPerceptron(4 * embed_dim, embed_dim, keep_prob_mlp)
    self.layer_norm2 = LayerNorm(embed_dim)
Example #6
Source File: absa_layer.py From BERT-E2E-ABSA with Apache License 2.0 | 5 votes |
def __init__(self, d_model, nhead, dropout=0.1):
    super(SAN, self).__init__()
    self.d_model = d_model
    self.nhead = nhead
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    self.dropout = nn.Dropout(p=dropout)
    self.norm = nn.LayerNorm(d_model)
Example #7
Source File: textual_heads.py From virtex with MIT License | 5 votes |
def _init_weights(module):
    r"""Initialize weights like BERT - N(0.0, 0.02), bias = 0."""
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=0.02)
    elif isinstance(module, nn.MultiheadAttention):
        module.in_proj_weight.data.normal_(mean=0.0, std=0.02)
        module.out_proj.weight.data.normal_(mean=0.0, std=0.02)
    elif isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=0.02)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
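Because nn.MultiheadAttention keeps its input projections in in_proj_weight and its output projection in out_proj, an initializer like this is normally applied recursively with nn.Module.apply. A minimal usage sketch, assuming _init_weights is callable as a standalone function (in the source it may be a method of the textual head) and using a stock encoder stack rather than the virtex model:

import torch.nn as nn

layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
model = nn.TransformerEncoder(layer, num_layers=6)

# apply() visits every submodule, so the nn.Linear and nn.MultiheadAttention
# instances inside the stack all receive the BERT-style initialization.
model.apply(_init_weights)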
Example #8
Source File: operations.py From torecsys with MIT License | 5 votes |
def dummy_attention(key: torch.Tensor,
                    query: torch.Tensor,
                    value: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    r"""Dummy function for torch's jit-compile features, with the same
    inputs and outputs as nn.MultiheadAttention().__call__().

    Args:
        key (T): inputs to be passed through as the output
        query (T): dummy inputs
        value (T): dummy inputs

    Returns:
        Tuple[T, T]: (key, dummy outputs = torch.Tensor([]))
    """
    return key, torch.Tensor([])
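One plausible use of such a passthrough is as a drop-in stand-in when attention is disabled, since it accepts three tensors and returns a pair just like nn.MultiheadAttention. A small sketch (the tensors and dimensions are made up for illustration):

import torch
import torch.nn as nn

embed_dim, num_heads = 64, 4
attn = nn.MultiheadAttention(embed_dim, num_heads)

x = torch.randn(5, 2, embed_dim)                      # (seq_len, batch, embed_dim)
real_out, real_weights = attn(x, x, x)                # real multi-head attention
dummy_out, dummy_weights = dummy_attention(x, x, x)   # passthrough: dummy_out is x

print(torch.equal(dummy_out, x))   # True
print(dummy_weights.numel())       # 0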
Example #9
Source File: operations.py From torecsys with MIT License | 5 votes |
def show_attention(attentions: np.ndarray,
                   xaxis: Union[list, str] = None,
                   yaxis: Union[list, str] = None,
                   savedir: str = None):
    r"""Show attention weights of MultiheadAttention in a matplotlib heatmap.

    Args:
        attentions (np.ndarray), shape = (sequence length, sequence length), dtype = np.float32:
            attention weights output by nn.MultiheadAttention
        xaxis (str, optional): string or list of x-axis labels. Defaults to None.
        yaxis (str, optional): string or list of y-axis labels. Defaults to None.
        savedir (str, optional): directory to save the attention png. Defaults to None.
    """
    # set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions)
    fig.colorbar(cax)

    # set up axes
    if xaxis is not None:
        if isinstance(xaxis, str):
            xaxis = [""] + xaxis.split(",")
        elif isinstance(xaxis, list):
            xaxis = [""] + xaxis
        ax.set_xticklabels(xaxis, rotation=90)

    if yaxis is not None:
        if isinstance(yaxis, str):
            yaxis = [""] + yaxis.split(",")
        elif isinstance(yaxis, list):
            yaxis = [""] + yaxis
        ax.set_yticklabels(yaxis)

    # show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    if savedir is None:
        plt.show()
    else:
        plt.savefig(savedir)
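With the default need_weights=True, nn.MultiheadAttention returns attention weights averaged over heads with shape (batch, target length, source length), so a single example can be fed to show_attention like this (a sketch; dimensions and labels are arbitrary):

import torch
import torch.nn as nn

attn = nn.MultiheadAttention(embed_dim=32, num_heads=4)
x = torch.randn(6, 1, 32)                        # (seq_len, batch=1, embed_dim)
_, weights = attn(x, x, x, need_weights=True)    # weights: (1, 6, 6)

show_attention(weights[0].detach().numpy(),
               xaxis="t1,t2,t3,t4,t5,t6",
               yaxis="t1,t2,t3,t4,t5,t6")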
Example #10
Source File: model.py From fine-grained-sentiment with MIT License | 5 votes |
def __init__(self, embed_dim, hidden_dim, num_embeddings, num_max_positions,
             num_heads, num_layers, dropout, causal):
    super().__init__()
    self.causal = causal
    self.tokens_embeddings = nn.Embedding(num_embeddings, embed_dim)
    self.position_embeddings = nn.Embedding(num_max_positions, embed_dim)
    self.dropout = nn.Dropout(dropout)

    self.attentions, self.feed_forwards = nn.ModuleList(), nn.ModuleList()
    self.layer_norms_1, self.layer_norms_2 = nn.ModuleList(), nn.ModuleList()
    for _ in range(num_layers):
        self.attentions.append(nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout))
        self.feed_forwards.append(nn.Sequential(nn.Linear(embed_dim, hidden_dim),
                                                nn.ReLU(),
                                                nn.Linear(hidden_dim, embed_dim)))
        self.layer_norms_1.append(nn.LayerNorm(embed_dim, eps=1e-12))
        self.layer_norms_2.append(nn.LayerNorm(embed_dim, eps=1e-12))
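When causal is True, the forward pass needs an additive attention mask so that each position can only attend to itself and earlier positions. A common way to build such a mask for nn.MultiheadAttention's attn_mask argument looks like this (a generic sketch, not necessarily this repository's exact code):

import torch

def causal_attn_mask(seq_len: int) -> torch.Tensor:
    # 0.0 on and below the diagonal, -inf above it; added to the attention
    # scores before the softmax inside nn.MultiheadAttention.
    mask = torch.full((seq_len, seq_len), float("-inf"))
    return torch.triu(mask, diagonal=1)

print(causal_attn_mask(4))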
Example #11
Source File: layers.py From MultiTurnDialogZoo with MIT License | 4 votes |
def __init__(self, hidden_size, nhead=8, dropout=0.3):
    super(Multi_head_attention_trs, self).__init__()
    self.nhead = nhead
    self.hidden_size = hidden_size

    if hidden_size % nhead != 0:
        raise Exception(f'hidden_size must be divisible by nhead, but got {hidden_size}/{nhead}.')

    self.multi_head_attention = nn.MultiheadAttention(hidden_size, nhead)
    self.layer_norm = nn.LayerNorm(hidden_size)
    self.final_attn = Attention(hidden_size)
Example #12
Source File: MReCoSa.py From MultiTurnDialogZoo with MIT License | 4 votes |
def __init__(self, embed_size, hidden_size, output_size,
             n_layer=2, dropout=0.5, pretrained=None):
    super(Decoder, self).__init__()
    self.embed_size, self.hidden_size = embed_size, hidden_size
    self.output_size = output_size
    self.n_layer = n_layer

    self.embed = nn.Embedding(output_size, embed_size)
    self.rnn = nn.GRU(hidden_size + embed_size, hidden_size,
                      num_layers=n_layer,
                      dropout=(0 if n_layer == 1 else dropout))
    self.out = nn.Linear(hidden_size, output_size)

    self.init_weight()

    self.self_attention = nn.MultiheadAttention(hidden_size, 8)
Example #13
Source File: transformer_sru.py From flambe with MIT License | 4 votes |
def __init__(self, d_model: int, nhead: int,
             dim_feedforward: int = 2048, dropout: float = 0.1,
             sru_dropout: Optional[float] = None,
             bidirectional: bool = False, **kwargs: Dict[str, Any]) -> None:
    """Initialize a TransformerSRUEncoderLayer.

    Parameters
    ----------
    d_model : int
        The number of expected features in the input.
    nhead : int
        The number of heads in the multiheadattention models.
    dim_feedforward : int, optional
        The dimension of the feedforward network (default=2048).
    dropout : float, optional
        The dropout value (default=0.1).
    sru_dropout : float, optional
        Dropout for the SRU cell. If not given, uses the same
        dropout value as the rest of the transformer.
    bidirectional : bool
        Whether the SRU module should be bidirectional. Default ``False``.

    Extra keyword arguments are passed to the SRUCell.

    """
    super().__init__()
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    self.sru = SRUCell(d_model, dim_feedforward, dropout, sru_dropout or dropout,
                       bidirectional=bidirectional, has_skip_term=False, **kwargs)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
Example #14
Source File: transformer_sru.py From flambe with MIT License | 4 votes |
def __init__(self, d_model: int, nhead: int,
             dim_feedforward: int = 2048, dropout: float = 0.1,
             sru_dropout: Optional[float] = None,
             **kwargs: Dict[str, Any]) -> None:
    """Initialize a TransformerSRUDecoderLayer.

    Parameters
    ----------
    d_model : int
        The number of expected features in the input.
    nhead : int
        The number of heads in the multiheadattention models.
    dim_feedforward : int, optional
        The dimension of the feedforward network (default=2048).
    dropout : float, optional
        The dropout value (default=0.1).
    sru_dropout : float, optional
        Dropout for the SRU cell. If not given, uses the same
        dropout value as the rest of the transformer.

    Extra keyword arguments are passed to the SRUCell.

    """
    super().__init__()
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    self.sru = SRUCell(d_model, dim_feedforward, dropout, sru_dropout or dropout,
                       bidirectional=False, has_skip_term=False, **kwargs)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.dropout3 = nn.Dropout(dropout)
Example #15
Source File: dp_multihead_attention.py From pytorch-dp with Apache License 2.0 | 4 votes |
def load_state_dict(self, state_dict):
    """
    Loads module from a previously saved state.

    Supports loading from both DPMultiheadAttention
    and nn.MultiheadAttention modules.
    """
    if "in_proj_weight" in state_dict:
        qweight, kweight, vweight = state_dict["in_proj_weight"].chunk(3, dim=0)

        state_dict["qlinear.weight"] = qweight
        state_dict["klinear.weight"] = kweight
        state_dict["vlinear.weight"] = vweight
        del state_dict["in_proj_weight"]

    if "in_proj_bias" in state_dict:
        qbias, kbias, vbias = state_dict["in_proj_bias"].chunk(3, dim=0)

        state_dict["qlinear.bias"] = qbias
        state_dict["klinear.bias"] = kbias
        state_dict["vlinear.bias"] = vbias
        del state_dict["in_proj_bias"]

    if "bias_k" in state_dict:
        state_dict["seq_bias_k.bias"] = state_dict["bias_k"].squeeze()
        del state_dict["bias_k"]

    if "bias_v" in state_dict:
        state_dict["seq_bias_v.bias"] = state_dict["bias_v"].squeeze()
        del state_dict["bias_v"]

    if "q_proj_weight" in state_dict:
        state_dict["qlinear.weight"] = state_dict["q_proj_weight"]
        del state_dict["q_proj_weight"]

    if "k_proj_weight" in state_dict:
        state_dict["klinear.weight"] = state_dict["k_proj_weight"]
        del state_dict["k_proj_weight"]

    if "v_proj_weight" in state_dict:
        state_dict["vlinear.weight"] = state_dict["v_proj_weight"]
        del state_dict["v_proj_weight"]

    super(DPMultiheadAttention, self).load_state_dict(state_dict)
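This override makes it possible to initialize the differentially-private module directly from a standard nn.MultiheadAttention checkpoint. A hedged usage sketch, assuming DPMultiheadAttention is constructed with the same (embed_dim, num_heads) arguments as the stock module (check the pytorch-dp source for the exact signature):

import torch.nn as nn

embed_dim, num_heads = 64, 8
vanilla = nn.MultiheadAttention(embed_dim, num_heads)
dp_attn = DPMultiheadAttention(embed_dim, num_heads)

# The remapping above turns the fused in_proj_weight / in_proj_bias tensors
# into the separate qlinear / klinear / vlinear parameters before loading.
dp_attn.load_state_dict(vanilla.state_dict())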