Python torch.nn.KLDivLoss() Examples

The following are 30 code examples of torch.nn.KLDivLoss(), extracted from open source projects. The original project and source file are noted above each example. You may also want to check out all available functions and classes of the torch.nn module, or try the search function.
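Before the project examples, here is a minimal, self-contained sketch of the basic contract of nn.KLDivLoss (the tensor names and shapes are illustrative, not taken from any project below): the input is expected to hold log-probabilities, the target holds probabilities, and reduction="batchmean" matches the mathematical definition of KL divergence.

import torch
import torch.nn as nn
import torch.nn.functional as F

kl = nn.KLDivLoss(reduction="batchmean")

student_logits = torch.randn(8, 10)   # 8 samples, 10 classes
teacher_logits = torch.randn(8, 10)

log_q = F.log_softmax(student_logits, dim=1)   # input: log-probabilities
p = F.softmax(teacher_logits, dim=1)           # target: probabilities

loss = kl(log_q, p)   # summed over classes, averaged over the batch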
Example #1
Source File: seq2slate_tf_trainer.py    From ReAgent with BSD 3-Clause "New" or "Revised" License
def __init__(
        self,
        seq2slate_net: Seq2SlateTransformerNet,
        parameters: Seq2SlateParameters,
        minibatch_size: int,
        use_gpu: bool = False,
        policy_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default
        ),
    ) -> None:
        self.parameters = parameters
        self.use_gpu = use_gpu
        self.seq2slate_net = seq2slate_net
        self.minibatch_size = minibatch_size
        self.minibatch = 0
        self.optimizer = policy_optimizer.make_optimizer(
            self.seq2slate_net.parameters()
        )
        self.kl_div_loss = nn.KLDivLoss(reduction="batchmean") 
Example #2
Source File: seq2slate_dr_trainer.py    From ReAgent with BSD 3-Clause "New" or "Revised" License
def __init__(
        self,
        seq2slate_net: Seq2SlateTransformerNet,
        parameters: Seq2SlateParameters,
        minibatch_size: int,
        use_gpu: bool = False,
        policy_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default
        ),
    ) -> None:
        self.parameters = parameters
        self.use_gpu = use_gpu
        self.seq2slate_net = seq2slate_net
        self.minibatch_size = minibatch_size
        self.minibatch = 0
        self.optimizer = policy_optimizer.make_optimizer(
            self.seq2slate_net.parameters()
        )
        # TODO: T62269969 add baseline_net in training
        self.kl_div_loss = nn.KLDivLoss(reduction="none") 
Example #3
Source File: seq2slate_attn_trainer.py    From ReAgent with BSD 3-Clause "New" or "Revised" License
def __init__(
        self,
        seq2slate_net: Seq2SlateTransformerNet,
        minibatch_size: int = 1024,
        loss_reporter=None,
        use_gpu: bool = False,
        policy_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default
        ),
    ) -> None:
        self.loss_reporter = loss_reporter
        self.use_gpu = use_gpu
        self.seq2slate_net = seq2slate_net
        self.minibatch_size = minibatch_size
        self.minibatch = 0
        self.optimizer = policy_optimizer.make_optimizer(
            self.seq2slate_net.parameters()
        )
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.kl_loss = nn.KLDivLoss(reduction="batchmean")
        if self.loss_reporter is None:
            self.loss_reporter = NoOpLossReporter() 
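A hedged usage sketch for the LogSoftmax/KLDivLoss pair constructed above (the tensor names and shapes are assumptions, not taken from ReAgent):

import torch
import torch.nn as nn

scores = torch.randn(32, 20)                               # raw per-item scores
target_probs = torch.softmax(torch.randn(32, 20), dim=1)   # reference distribution over items
kl_loss = nn.KLDivLoss(reduction="batchmean")
loss = kl_loss(nn.LogSoftmax(dim=1)(scores), target_probs)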
Example #4
Source File: Loss.py    From video-caption-openNMT.pytorch with MIT License
def __init__(self, generator, tgt_vocab, normalization="sents",
                 label_smoothing=0.0):
        super(NMTLossCompute, self).__init__(generator, tgt_vocab)
        assert (label_smoothing >= 0.0 and label_smoothing <= 1.0)
        if label_smoothing > 0:
            # When label smoothing is turned on,
            # KL-divergence between q_{smoothed ground truth prob.}(w)
            # and p_{prob. computed by model}(w) is minimized.
            # If label smoothing value is set to zero, the loss
            # is equivalent to NLLLoss or CrossEntropyLoss.
            # All non-true labels are uniformly set to low-confidence.
            self.criterion = nn.KLDivLoss(size_average=False)
            one_hot = torch.randn(1, len(tgt_vocab))
            one_hot.fill_(label_smoothing / (len(tgt_vocab) - 2))
            one_hot[0][self.padding_idx] = 0
            self.register_buffer('one_hot', one_hot)
        else:
            weight = torch.ones(len(tgt_vocab))
            weight[self.padding_idx] = 0
            self.criterion = nn.NLLLoss(weight, size_average=False)
        self.confidence = 1.0 - label_smoothing 
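The comments above describe the label-smoothing idea; a hedged sketch of how such a criterion is typically applied (the function name, argument names, and masking detail are assumptions, not verbatim project code) builds the smoothed target distribution from the one_hot buffer and feeds it to KLDivLoss together with the model's log-probabilities:

import torch
import torch.nn as nn

def label_smoothed_kl(log_probs, gtruth, one_hot, confidence, padding_idx):
    # log_probs: (num_tokens, vocab_size) log-probabilities from the generator
    # gtruth:    (num_tokens,) gold token indices
    tdata = one_hot.repeat(gtruth.size(0), 1)             # uniform smoothing mass everywhere
    tdata.scatter_(1, gtruth.unsqueeze(1), confidence)    # put the confidence on the gold token
    tdata = tdata.masked_fill((gtruth == padding_idx).unsqueeze(1), 0.0)  # padding rows contribute nothing
    return nn.KLDivLoss(reduction="sum")(log_probs, tdata)  # reduction="sum" is the modern spelling of size_average=False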
Example #5
Source File: Loss.py    From DC-NeuralConversation with MIT License
def __init__(self, generator, tgt_vocab, normalization="sents",
                 label_smoothing=0.0):
        super(NMTLossCompute, self).__init__(generator, tgt_vocab)
        assert (label_smoothing >= 0.0 and label_smoothing <= 1.0)

        if label_smoothing > 0:
            # When label smoothing is turned on,
            # KL-divergence between q_{smoothed ground truth prob.}(w)
            # and p_{prob. computed by model}(w) is minimized.
            # If label smoothing value is set to zero, the loss
            # is equivalent to NLLLoss or CrossEntropyLoss.
            # All non-true labels are uniformly set to low-confidence.
            self.criterion = nn.KLDivLoss(size_average=False)
            one_hot = torch.randn(1, len(tgt_vocab))
            one_hot.fill_(label_smoothing / (len(tgt_vocab) - 2))
            one_hot[0][self.padding_idx] = 0
            self.register_buffer('one_hot', one_hot)
        else:
            weight = torch.ones(len(tgt_vocab))
            weight[self.padding_idx] = 0
            self.criterion = nn.NLLLoss(weight, size_average=False)
        self.confidence = 1.0 - label_smoothing 
Example #6
Source File: Loss.py    From data2text-entity-py with MIT License
def __init__(self, generator, tgt_vocab, normalization="sents",
                 label_smoothing=0.0):
        super(NMTLossCompute, self).__init__(generator, tgt_vocab)
        assert (label_smoothing >= 0.0 and label_smoothing <= 1.0)
        if label_smoothing > 0:
            # When label smoothing is turned on,
            # KL-divergence between q_{smoothed ground truth prob.}(w)
            # and p_{prob. computed by model}(w) is minimized.
            # If label smoothing value is set to zero, the loss
            # is equivalent to NLLLoss or CrossEntropyLoss.
            # All non-true labels are uniformly set to low-confidence.
            self.criterion = nn.KLDivLoss(size_average=False)
            one_hot = torch.randn(1, len(tgt_vocab))
            one_hot.fill_(label_smoothing / (len(tgt_vocab) - 2))
            one_hot[0][self.padding_idx] = 0
            self.register_buffer('one_hot', one_hot)
        else:
            weight = torch.ones(len(tgt_vocab))
            weight[self.padding_idx] = 0
            self.criterion = nn.NLLLoss(weight, size_average=False)
        self.confidence = 1.0 - label_smoothing 
Example #7
Source File: label_smoothing_loss.py    From espnet with Apache License 2.0
def __init__(
        self,
        size,
        padding_idx,
        smoothing,
        normalize_length=False,
        criterion=nn.KLDivLoss(reduction="none"),
    ):
        """Construct an LabelSmoothingLoss object."""
        super(LabelSmoothingLoss, self).__init__()
        self.criterion = criterion
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        self.normalize_length = normalize_length 
Example #8
Source File: Loss.py    From reversible-rnn with MIT License
def __init__(self, generator, tgt_vocab, label_smoothing=0.0):
        super(NMTLossCompute, self).__init__(generator, tgt_vocab)
        assert (label_smoothing >= 0.0 and label_smoothing <= 1.0)

        self.tgt_vocab_len = len(tgt_vocab)

        if label_smoothing > 0:
            # When label smoothing is turned on,
            # KL-divergence between q_{smoothed ground truth prob.}(w)
            # and p_{prob. computed by model}(w) is minimized.
            # If label smoothing value is set to zero, the loss
            # is equivalent to NLLLoss or CrossEntropyLoss.
            # All non-true labels are uniformly set to low-confidence.
            self.criterion = nn.KLDivLoss(size_average=False)
            one_hot = torch.randn(1, len(tgt_vocab))
            one_hot.fill_(label_smoothing / (len(tgt_vocab) - 2))
            one_hot[0][self.padding_idx] = 0
            self.register_buffer('one_hot', one_hot)
        else:
            weight = torch.ones(len(tgt_vocab))
            weight[self.padding_idx] = 0
            self.criterion = nn.NLLLoss(weight, size_average=False)  # IMPORTANT: NLLLoss is what we use. Interesting that size_average=False
            # ipdb.set_trace()
        self.confidence = 1.0 - label_smoothing 
Example #9
Source File: updater.py    From born_again_neuralnet with MIT License
def kd_loss(self, outputs, labels, teacher_outputs, alpha=0.2, T=20):
        KD_loss = nn.KLDivLoss()(F.log_softmax(outputs/T, dim=1),
                                 F.softmax(teacher_outputs/T, dim=1)) * \
            alpha + F.cross_entropy(outputs, labels) * (1. - alpha)

        return KD_loss 
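Note that this variant relies on KLDivLoss's default reduction ('mean', which divides by the number of elements) and omits the T * T factor used elsewhere. A hedged alternative sketch following the common Hinton-style distillation recipe (not this project's code):

import torch
import torch.nn as nn
import torch.nn.functional as F

def kd_loss_batchmean(outputs, labels, teacher_outputs, alpha=0.2, T=20):
    # Soft term: KL between temperature-softened student and teacher distributions.
    soft = nn.KLDivLoss(reduction="batchmean")(
        F.log_softmax(outputs / T, dim=1),
        F.softmax(teacher_outputs / T, dim=1),
    ) * (T * T)
    hard = F.cross_entropy(outputs, labels)   # hard term on the true labels
    return alpha * soft + (1. - alpha) * hard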
Example #10
Source File: binDeltaLosses.py    From multi-modal-regression with MIT License
def __init__(self, alpha):
		super().__init__()
		self.alpha = alpha
		self.mse = nn.MSELoss().cuda()
		self.kl = nn.KLDivLoss().cuda() 
Example #11
Source File: losses.py    From self-critical.pytorch with MIT License
def __init__(self, size=0, padding_idx=0, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False, reduce=False)
        # self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        # self.size = size
        self.true_dist = None 
Example #12
Source File: latent_clustering_engine.py    From tatk with Apache License 2.0
def __init__(self, model, args, verbose=False):
        super(LatentClusteringEngine, self).__init__(model, args, verbose)
        self.crit = nn.CrossEntropyLoss(reduction='sum')
        self.kldiv = nn.KLDivLoss(reduction='sum')
        self.cluster_crit = nn.NLLLoss(reduction='sum')
        self.sel_crit = Criterion(
            self.model.item_dict,
            bad_toks=['<disconnect>', '<disagree>'],
            reduction='mean' if args.sep_sel else 'none')

        self.sel_model = utils.load_model(args.selection_model_file)
        self.sel_model.eval() 
Example #13
Source File: losses.py    From AlignedReID with MIT License
def __init__(self):
        super(KLMutualLoss,self).__init__()
        self.kl_loss = nn.KLDivLoss(size_average=False)
        self.log_softmax = nn.functional.log_softmax
        self.softmax = nn.functional.softmax 
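The constructor above only stores the handles; a plausible forward pass for a mutual-learning KL loss (a sketch under assumptions, not necessarily AlignedReID's exact code) takes log-probabilities from one model and detached probabilities from its peer:

import torch
import torch.nn as nn
import torch.nn.functional as F

def kl_mutual_forward(pred1, pred2):
    # pred1, pred2: raw logits from two peer models, shape (batch, num_classes)
    log_p1 = F.log_softmax(pred1, dim=1)
    p2 = F.softmax(pred2, dim=1).detach()   # the peer's distribution acts as a fixed target
    return nn.KLDivLoss(reduction="sum")(log_p1, p2)   # reduction="sum" matches size_average=False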
Example #14
Source File: my_loss_function.py    From Teacher-free-Knowledge-Distillation with MIT License
def loss_kd_self(outputs, labels, teacher_outputs, params):
    """
    loss function for self training: Tf-KD_{self}
    """
    alpha = params.alpha
    T = params.temperature

    loss_CE = F.cross_entropy(outputs, labels)
    D_KL = nn.KLDivLoss()(F.log_softmax(outputs/T, dim=1), F.softmax(teacher_outputs/T, dim=1)) * (T * T) * params.multiplier  # params.multiplier is 1.0 in most cases; in some cases it is 10 or 50
    KD_loss =  (1. - alpha)*loss_CE + alpha*D_KL

    return KD_loss 
Example #15
Source File: losses.py    From ImageCaptioning.pytorch with MIT License
def __init__(self, size=0, padding_idx=0, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False, reduce=False)
        # self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        # self.size = size
        self.true_dist = None 
Example #16
Source File: affinity_loss.py    From pytorch-loss with MIT License
def __init__(self, kl_margin, lambda_edge=1., lambda_not_edge=1., ignore_lb=255):
        super(AffinityFieldLoss, self).__init__()
        self.kl_margin = kl_margin
        self.ignore_lb = ignore_lb
        self.lambda_edge = lambda_edge
        self.lambda_not_edge = lambda_not_edge
        self.kldiv = nn.KLDivLoss(reduction='none') 
Example #17
Source File: my_loss_function.py    From Teacher-free-Knowledge-Distillation with MIT License
def loss_kd(outputs, labels, teacher_outputs, params):
    """
    loss function for Knowledge Distillation (KD)
    """
    alpha = params.alpha
    T = params.temperature

    loss_CE = F.cross_entropy(outputs, labels)
    D_KL = nn.KLDivLoss()(F.log_softmax(outputs/T, dim=1), F.softmax(teacher_outputs/T, dim=1)) * (T * T)
    KD_loss =  (1. - alpha)*loss_CE + alpha*D_KL

    return KD_loss 
Example #18
Source File: label_smoothing.py    From MTN with MIT License
def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None 
Example #19
Source File: hd3losses.py    From hd3 with BSD 3-Clause "New" or "Revised" License
def __call__(self, ms_prob, ms_pred, gt, corr_range, ds=6):
        B, C, H, W = gt.size()
        lv = len(ms_prob)
        criterion = nn.KLDivLoss(reduction='batchmean').cuda()
        losses = {}
        kld_loss = 0
        for l in range(lv):
            scaled_gt, valid_mask = downsample_flow(gt, 1 / 2**(ds - l))
            if self.task == 'stereo':
                scaled_gt = scaled_gt[:, 0, :, :].unsqueeze(1)
            if l > 0:
                scaled_gt = scaled_gt - F.interpolate(
                    ms_pred[l - 1],
                    scale_factor=2,
                    mode='bilinear',
                    align_corners=True)
            scaled_gt = scaled_gt / 2**(ds - l)
            gt_dist = vector2density(scaled_gt, corr_range[l],
                                     self.dim) * valid_mask
            kld_loss += 4**(ds - l) / (H * W) * criterion(
                F.log_softmax(ms_prob[l], dim=1), gt_dist.detach())

        losses['total'] = kld_loss
        for loss_type, loss_value in losses.items():
            losses[loss_type] = loss_value.reshape(1)
        return losses 
Example #20
Source File: model.py    From mrqa with Apache License 2.0
def forward_qa(self, input_ids, token_type_ids, attention_mask, start_positions, end_positions, global_step):
        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        cls_embedding = sequence_output[:, 0]
        if self.concat:
            sep_embedding = self.get_sep_embedding(input_ids, sequence_output)
            hidden = torch.cat([cls_embedding, sep_embedding], dim=1)
        else:
            hidden = sequence_output[:, 0]  # [b, d] : [CLS] representation
        log_prob = self.discriminator(hidden)
        targets = torch.ones_like(log_prob) * (1 / self.num_classes)
        # As with NLLLoss, the input given is expected to contain log-probabilities
        # and is not restricted to a 2D Tensor. The targets are given as probabilities
        kl_criterion = nn.KLDivLoss(reduction="batchmean")
        if self.anneal:
            self.dis_lambda = self.dis_lambda * kl_coef(global_step)
        kld = self.dis_lambda * kl_criterion(log_prob, targets)

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        # If we are on multi-GPU, the start/end positions may carry an extra dimension; squeeze it
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1)
        # sometimes the start/end positions are outside our model inputs; we ignore these terms
        ignored_index = start_logits.size(1)
        start_positions.clamp_(0, ignored_index)
        end_positions.clamp_(0, ignored_index)

        loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        qa_loss = (start_loss + end_loss) / 2
        total_loss = qa_loss + kld
        return total_loss 
Example #21
Source File: trainer.py    From nni with MIT License
def __init__(self, temperature):
        super().__init__()
        self.temperature = temperature
        # self.kl_loss = nn.KLDivLoss(reduction = 'batchmean')
        self.kl_loss = nn.KLDivLoss() 
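The commented-out line above hints at the practical difference between the two reductions; a small comparison sketch (shapes are illustrative):

import torch
import torch.nn as nn
import torch.nn.functional as F

log_q = F.log_softmax(torch.randn(4, 10), dim=1)
p = F.softmax(torch.randn(4, 10), dim=1)

loss_mean = nn.KLDivLoss()(log_q, p)                            # sum of elementwise terms / (4 * 10)
loss_batchmean = nn.KLDivLoss(reduction="batchmean")(log_q, p)  # sum of elementwise terms / 4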
Example #22
Source File: loss.py    From joeynmt with Apache License 2.0
def __init__(self, pad_index: int, smoothing: float = 0.0):
        super(XentLoss, self).__init__()
        self.smoothing = smoothing
        self.pad_index = pad_index
        if self.smoothing <= 0.0:
            # standard xent loss
            self.criterion = nn.NLLLoss(ignore_index=self.pad_index,
                                        reduction='sum')
        else:
            # custom label-smoothed loss, computed with KL divergence loss
            self.criterion = nn.KLDivLoss(reduction='sum') 
Example #23
Source File: binDeltaLosses.py    From multi-modal-regression with MIT License
def __init__(self, alpha, kmeans_file, my_loss):
		super().__init__()
		self.alpha = alpha
		kmeans = pickle.load(open(kmeans_file, 'rb'))
		self.cluster_centers_ = Variable(torch.from_numpy(kmeans.cluster_centers_).float()).cuda()
		self.my_loss = my_loss
		self.kl = nn.KLDivLoss().cuda() 
Example #24
Source File: binDeltaLosses.py    From multi-modal-regression with MIT License
def __init__(self, alpha, gmm_file, my_loss):
		super().__init__()
		self.alpha = alpha
		gmm = pickle.load(open(gmm_file, 'rb'))
		self.cluster_centers = Variable(torch.from_numpy(gmm.means_).float()).cuda()
		self.n_clusters = gmm.n_components
		self.my_loss = my_loss
		self.kl = nn.KLDivLoss().cuda() 
Example #25
Source File: binDeltaLosses.py    From multi-modal-regression with MIT License
def __init__(self, alpha, kmeans_file, my_loss):
		super().__init__()
		self.alpha = alpha
		kmeans = pickle.load(open(kmeans_file, 'rb'))
		self.cluster_centers = Variable(torch.from_numpy(convert_dictionary(kmeans.cluster_centers_)).float()).cuda()
		self.n_clusters = kmeans.n_clusters
		self.my_loss = my_loss
		self.kl = nn.KLDivLoss().cuda() 
Example #26
Source File: binDeltaLosses.py    From multi-modal-regression with MIT License
def __init__(self, alpha, kmeans_file, my_loss=None):
		super().__init__()
		self.alpha = alpha
		kmeans = pickle.load(open(kmeans_file, 'rb'))
		self.cluster_centers = Variable(torch.from_numpy(kmeans.cluster_centers_).float()).cuda()
		self.n_clusters = kmeans.n_clusters
		if my_loss is None:
			self.mse = nn.MSELoss(reduce=False).cuda()
		else:
			self.mse = my_loss
		self.kl = nn.KLDivLoss().cuda() 
Example #27
Source File: __init__.py    From BPT with MIT License
def __init__(self, size, smoothing=0.0):
        """Label Smoothing module
        args:
            size: vocab_size
            smoothing: smoothing ratio
        """
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.size = size
        self.smoothing = smoothing 
Example #28
Source File: utils.py    From GoogleConceptualCaptioning with MIT License
def __init__(self, size=0, padding_idx=0, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False, reduce=False)
        # self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        # self.size = size
        self.true_dist = None 
Example #29
Source File: ranking_listwise_evaluator.py    From ReAgent with BSD 3-Clause "New" or "Revised" License
def __init__(self, seq2slate_net, slate_size: int, calc_cpe: bool) -> None:
        self.seq2slate_net = seq2slate_net
        self.slate_size = slate_size
        self.calc_cpe = calc_cpe
        self.ndcg = []
        self.dcg = []
        self.mean_ap = []
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.kl_loss = nn.KLDivLoss(reduction="batchmean") 
Example #30
Source File: loss.py    From xfer with Apache License 2.0
def __init__(self, temperature):
        super(TemperatureScaledKLDivLoss, self).__init__()
        self.temperature = temperature
        self.kullback_leibler_divergence = nn.KLDivLoss(reduction="batchmean")
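The constructor only stores the temperature and a "batchmean" criterion; a plausible forward pass (a sketch under assumptions, not necessarily xfer's exact code) softens both distributions with the temperature and rescales the result:

import torch
import torch.nn as nn
import torch.nn.functional as F

class TemperatureScaledKLDivLossSketch(nn.Module):
    def __init__(self, temperature):
        super().__init__()
        self.temperature = temperature
        self.kullback_leibler_divergence = nn.KLDivLoss(reduction="batchmean")

    def forward(self, student_logits, teacher_logits):
        log_p = F.log_softmax(student_logits / self.temperature, dim=1)
        q = F.softmax(teacher_logits / self.temperature, dim=1)
        # T**2 keeps gradient magnitudes comparable across temperatures.
        return (self.temperature ** 2) * self.kullback_leibler_divergence(log_p, q)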