Python nltk.translate.bleu_score.SmoothingFunction() Examples

The following are 30 code examples of nltk.translate.bleu_score.SmoothingFunction(), drawn from open-source projects. Each example is attributed to its original project and source file. You may also want to check out all available functions/classes of the module nltk.translate.bleu_score, or try the search function.
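Before the project examples, here is a minimal, self-contained sketch (not taken from any project below) of how a SmoothingFunction is typically passed to sentence_bleu; the token lists are invented for illustration:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Invented whitespace-tokenized sentences, for illustration only.
reference = "the cat sat on the mat".split()
hypothesis = "the cat is on the mat".split()

# method1 adds a small epsilon to n-gram precisions with zero counts,
# so short or partially matching hypotheses do not collapse to 0.0.
smoothie = SmoothingFunction().method1
score = sentence_bleu([reference], hypothesis, smoothing_function=smoothie)
print(score)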
Example #1
Source File: metrics.py    From ParlAI with MIT License
def compute(guess: str, answers: List[str], k: int = 4) -> Optional['BleuMetric']:
        """
        Compute approximate BLEU score between guess and a set of answers.
        """
        if nltkbleu is None:
            # bleu library not installed, just return a default value
            return None
        # Warning: BLEU calculation *should* include proper tokenization and
        # punctuation etc. We're using the normalize_answer for everything though,
        # so we're over-estimating our BLEU scores.  Also note that NLTK's bleu is
        # going to be slower than fairseq's (which is written in C), but fairseq's
        # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
        # works with strings, which is better suited for this module.
        weights = [1 / k for _ in range(k)]
        score = nltkbleu.sentence_bleu(
            [normalize_answer(a).split(" ") for a in answers],
            normalize_answer(guess).split(" "),
            smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
            weights=weights,
        )
        return BleuMetric(score) 
Example #2
Source File: seq2seq.py    From chainer with MIT License
def __call__(self, trainer):
        device = self.device

        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
                references.extend([[t.tolist()] for t in targets])

                sources = [device.send(x) for x in sources]
                ys = [y.tolist()
                      for y in self.model.translate(sources, self.max_length)]
                hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references, hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        chainer.report({self.key: bleu}) 
Example #3
Source File: precision_recall.py    From cotk with Apache License 2.0
def _score(self, gen: List[int], reference: List[int]) -> float:
		'''Return a BLEU score in [0, 1], used to compute BLEU-ngram precision and recall.

		Arguments:
			gen (list): list of generated word ids.
			reference (list): list of word ids of a reference.

		Here is an Example:

			>>> gen = [4,5]
			>>> reference = [5,6]
			>>> self._score(gen, reference)
			0.150 # assume self.weights = [0.25,0.25,0.25,0.25]
		'''
		gen = self._replace_unk(gen)
		return sentence_bleu([reference], gen, self.weights, SmoothingFunction().method1) 
Example #4
Source File: metrics.py    From KBRD with MIT License
def _bleu(guess, answers):
    """Compute approximate BLEU score between guess and a set of answers."""
    if nltkbleu is None:
        # bleu library not installed, just return a default value
        return None
    # Warning: BLEU calculation *should* include proper tokenization and
    # punctuation etc. We're using the normalize_answer for everything though,
    # so we're over-estimating our BLEU scores.  Also note that NLTK's bleu is
    # going to be slower than fairseq's (which is written in C), but fairseq's
    # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
    # works with strings, which is better suited for this module.
    return nltkbleu.sentence_bleu(
        [normalize_answer(a).split(" ") for a in answers],
        normalize_answer(guess).split(" "),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
    ) 
Example #5
Source File: metrics.py    From neural_chat with MIT License
def _bleu(guess, answers):
    """Compute approximate BLEU score between guess and a set of answers."""
    if nltkbleu is None:
        # bleu library not installed, just return a default value
        return None
    # Warning: BLEU calculation *should* include proper tokenization and
    # punctuation etc. We're using the normalize_answer for everything though,
    # so we're over-estimating our BLEU scores.  Also note that NLTK's bleu is
    # going to be slower than fairseq's (which is written in C), but fairseq's
    # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
    # works with strings, which is better suited for this module.
    return nltkbleu.sentence_bleu(
        [normalize_answer(a).split(" ") for a in answers],
        normalize_answer(guess).split(" "),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
    ) 
Example #6
Source File: utils.py    From quick-nlp with MIT License
def print_batch(learner: Learner, modeldata: ModelData, input_field, output_field, num_batches=1, num_sentences=-1,
                is_test=False, num_beams=1, weights=None, smoothing_function=None):
    predictions, targets, inputs = learner.predict_with_targs_and_inputs(is_test=is_test, num_beams=num_beams)
    weights = (1 / 3., 1 / 3., 1 / 3.) if weights is None else weights
    smoothing_function = SmoothingFunction().method1 if smoothing_function is None else smoothing_function
    blue_scores = []
    for batch_num, (input, target, prediction) in enumerate(zip(inputs, targets, predictions)):
        inputs_str: BatchBeamTokens = modeldata.itos(input, input_field)
        predictions_str: BatchBeamTokens = modeldata.itos(prediction, output_field)
        targets_str: BatchBeamTokens = modeldata.itos(target, output_field)
        for index, (inp, targ, pred) in enumerate(zip(inputs_str, targets_str, predictions_str)):
            blue_score = sentence_bleu([targ], pred, smoothing_function=smoothing_function, weights=weights)
            print(
                f'batch: {batch_num} sample : {index}\ninput: {" ".join(inp)}\ntarget: { " ".join(targ)}\nprediction: {" ".join(pred)}\nbleu: {blue_score}\n\n')
            blue_scores.append(blue_score)
            if 0 < num_sentences <= index - 1:
                break
        if 0 < num_batches <= batch_num - 1:
            break
    print(f'mean bleu score: {np.mean(blue_scores)}') 
Example #7
Source File: seq2seq_chainerio.py    From pfio with MIT License
def forward(self, trainer):
        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
                references.extend([[t.tolist()] for t in targets])

                sources = [
                    chainer.dataset.to_device(self.device, x) for x in sources]
                ys = [y.tolist()
                      for y in self.model.translate(sources, self.max_length)]
                hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references, hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        chainer.report({self.key: bleu}) 
Example #8
Source File: seq2seq.py    From convolutional_seq2seq with BSD 3-Clause "New" or "Revised" License
def __call__(self, trainer):
        print('## Calculate BLEU')
        with chainer.no_backprop_mode():
            with chainer.using_config('train', False):
                references = []
                hypotheses = []
                for i in range(0, len(self.test_data), self.batch):
                    sources, targets = zip(*self.test_data[i:i + self.batch])
                    references.extend([[t.tolist()] for t in targets])

                    sources = [
                        chainer.dataset.to_device(self.device, x) for x in sources]
                    ys = [y.tolist()
                          for y in self.model.translate(sources, self.max_length)]
                    hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references, hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1) * 100
        print('BLEU:', bleu)
        reporter.report({self.key: bleu}) 
Example #9
Source File: seq2seq.py    From chainer with MIT License
def __call__(self, trainer):
        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
                references.extend([[t.tolist()] for t in targets])

                sources = [
                    chainer.dataset.to_device(self.device, x) for x in sources]
                ys = [y.tolist()
                      for y in self.model.translate(sources, self.max_length)]
                hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references, hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        reporter.report({self.key: bleu}) 
Example #10
Source File: test_bleu.py    From cotk with Apache License 2.0
def get_bleu(self, dataloader, input, reference_key, gen_key):
		refs = []
		gens = []
		for gen_sen, resp_sen in zip(input[gen_key], input[reference_key]):
			gen_sen_processed = dataloader.trim_in_ids(gen_sen)
			resp_sen_processed = dataloader.trim_in_ids(resp_sen[1:])
			refs.append(resp_sen_processed)
			gens.append(gen_sen_processed)
		gens = replace_unk(gens)
		bleu_irl_bw, bleu_irl_fw = [], []
		for i in range(len(gens)):
			bleu_irl_fw.append(sentence_bleu(refs, gens[i], smoothing_function=SmoothingFunction().method1))
		for i in range(len(refs)):
			bleu_irl_bw.append(sentence_bleu(gens, refs[i], smoothing_function=SmoothingFunction().method1))

		fw_bleu = (1.0 * sum(bleu_irl_fw) / len(bleu_irl_fw))
		bw_bleu = (1.0 * sum(bleu_irl_bw) / len(bleu_irl_bw))
		return 2.0 * bw_bleu * fw_bleu / (fw_bleu + bw_bleu) 
Example #11
Source File: bleu_metrics.py    From dialog-eval with MIT License
def __init__(self, smoothing):
    '''
    Params:
      :smoothing: Smoothing method for bleu.
    '''
    self.metrics = {'bleu-1': [], 'bleu-2': [], 'bleu-3': [], 'bleu-4': []}
    self.smoothing = [bleu_score.SmoothingFunction().method0,
                      bleu_score.SmoothingFunction().method1,
                      bleu_score.SmoothingFunction().method2,
                      bleu_score.SmoothingFunction().method3,
                      bleu_score.SmoothingFunction().method4,
                      bleu_score.SmoothingFunction().method5,
                      bleu_score.SmoothingFunction().method6,
                      bleu_score.SmoothingFunction().method7]
    self.smoothing = self.smoothing[smoothing]

  # Calculate metrics for one example. 
Example #12
Source File: bleu.py    From dialogbot with Apache License 2.0
def bleu(answer_file, standard_answer_file):
    rf_answer = open(answer_file, 'r', "utf-8")
    rf_standard_answer = open(standard_answer_file, 'r', "utf-8")
    answer_lines = rf_answer.readlines()
    standard_answer_lines = rf_standard_answer.readlines()
    # compute score
    scores = []
    for i in range(len(answer_lines)):
        candidate = list(answer_lines[i].strip())
        each_score = 0
        for j in range(10):
            references = []
            standard_answer_line = standard_answer_lines[i * 11 + j].strip().split('\t')
            references.append(list(standard_answer_line[0].strip()))
            standard_score = standard_answer_line[1]
            bleu_score = sentence_bleu(references, candidate, weights=(0.35, 0.45, 0.1, 0.1),
                                       smoothing_function=SmoothingFunction().method1)
            each_score = bleu_score * float(standard_score) + each_score
        scores.append(each_score / 10)
    rf_answer.close()
    rf_standard_answer.close()
    score_final = sum(scores) / float(len(answer_lines))
    precision_score = round(score_final, 6)
    return precision_score 
Example #13
Source File: metric.py    From MultiTurnDialogZoo with MIT License
def cal_BLEU_nltk(refer, candidate, ngram=1):
    '''
    SmoothingFunction refer to https://github.com/PaddlePaddle/models/blob/a72760dff8574fe2cb8b803e01b44624db3f3eff/PaddleNLP/Research/IJCAI2019-MMPMS/mmpms/utils/metrics.py
    '''
    smoothie = SmoothingFunction().method7
    if ngram == 1:
        weight = (1, 0, 0, 0)
    elif ngram == 2:
        weight = (0.5, 0.5, 0, 0)
    elif ngram == 3:
        weight = (0.33, 0.33, 0.33, 0)
    elif ngram == 4:
        weight = (0.25, 0.25, 0.25, 0.25)
    return sentence_bleu(refer, candidate, 
                         weights=weight, 
                         smoothing_function=smoothie)

# BLEU of nlg-eval 
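A possible call of cal_BLEU_nltk above (an illustration, not from the project): refer is a list of tokenized reference sentences, candidate a tokenized hypothesis, and ngram selects uniform weights over 1-4 grams:

refer = [['how', 'are', 'you', 'doing', 'today']]   # one tokenized reference
candidate = ['how', 'are', 'you', 'today']
print(cal_BLEU_nltk(refer, candidate, ngram=2))      # BLEU-2 with method7 smoothing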
Example #14
Source File: evaluators.py    From ConvLab with MIT License
def get_report(self):
        tokenize = lambda x: x.split()
        print('Generate report for {} samples'.format(len(self.hyps)))
        refs, hyps = [], []
        tp, fp, fn = 0, 0, 0
        for label, hyp in zip(self.labels, self.hyps):
            ref_tokens = [BOS] + tokenize(label.replace(SYS, '').replace(USR, '').strip()) + [EOS]
            hyp_tokens = [BOS] + tokenize(hyp.replace(SYS, '').replace(USR, '').strip()) + [EOS]
            refs.append([ref_tokens])
            hyps.append(hyp_tokens)

            ref_entities = self._parse_entities(ref_tokens)
            hyp_entities = self._parse_entities(hyp_tokens)
            tpp, fpp, fnn = self._get_tp_fp_fn(ref_entities, hyp_entities)
            tp += tpp
            fp += fpp
            fn += fnn

        # bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
        bleu = BLEUScorer().score(hyps, refs)
        prec, rec, f1 = self._get_prec_recall(tp, fp, fn)
        report = "\nBLEU score {}\nEntity precision {:.4f} recall {:.4f} and f1 {:.4f}\n".format(bleu, prec, rec, f1)
        return report, bleu, prec, rec, f1 
Example #15
Source File: evaluators.py    From ConvLab with MIT License
def get_report(self):
        tokenize = get_tokenize()
        print('Generate report for {} samples'.format(len(self.hyps)))
        refs, hyps = [], []
        for label, hyp in zip(self.labels, self.hyps):
            # label = label.replace(EOS, '')
            # hyp = hyp.replace(EOS, '')
            # ref_tokens = tokenize(label)[1:]
            # hyp_tokens = tokenize(hyp)[1:]
            ref_tokens = tokenize(label)
            hyp_tokens = tokenize(hyp)
            refs.append([ref_tokens])
            hyps.append(hyp_tokens)
        bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
        report = '\n===== BLEU = %f =====\n' % (bleu,)
        return '\n===== REPORT FOR DATASET {} ====={}'.format(self.data_name, report) 
Example #16
Source File: evaluators.py    From NeuralDialog-LaRL with Apache License 2.0
def get_report(self):
        tokenize = get_tokenize()
        print('Generate report for {} samples'.format(len(self.hyps)))
        refs, hyps = [], []
        for label, hyp in zip(self.labels, self.hyps):
            # label = label.replace(EOS, '')
            # hyp = hyp.replace(EOS, '')
            # ref_tokens = tokenize(label)[1:]
            # hyp_tokens = tokenize(hyp)[1:]
            ref_tokens = tokenize(label)
            hyp_tokens = tokenize(hyp)
            refs.append([ref_tokens])
            hyps.append(hyp_tokens)
        bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
        report = '\n===== BLEU = %f =====\n' % (bleu,)
        return '\n===== REPORT FOR DATASET {} ====={}'.format(self.data_name, report) 
Example #17
Source File: evaluators.py    From NeuralDialog-LaRL with Apache License 2.0
def get_report(self):
        tokenize = lambda x: x.split()
        print('Generate report for {} samples'.format(len(self.hyps)))
        refs, hyps = [], []
        tp, fp, fn = 0, 0, 0
        for label, hyp in zip(self.labels, self.hyps):
            ref_tokens = [BOS] + tokenize(label.replace(SYS, '').replace(USR, '').strip()) + [EOS]
            hyp_tokens = [BOS] + tokenize(hyp.replace(SYS, '').replace(USR, '').strip()) + [EOS]
            refs.append([ref_tokens])
            hyps.append(hyp_tokens)

            ref_entities = self._parse_entities(ref_tokens)
            hyp_entities = self._parse_entities(hyp_tokens)
            tpp, fpp, fnn = self._get_tp_fp_fn(ref_entities, hyp_entities)
            tp += tpp
            fp += fpp
            fn += fnn

        # bleu = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
        bleu = BLEUScorer().score(hyps, refs)
        prec, rec, f1 = self._get_prec_recall(tp, fp, fn)
        report = "\nBLEU score {}\nEntity precision {:.4f} recall {:.4f} and f1 {:.4f}\n".format(bleu, prec, rec, f1)
        return report, bleu, prec, rec, f1 
Example #18
Source File: metrics.py    From deepAPI with MIT License
def sim_bleu(self, hyps, ref):
        """
        :param ref - a list of tokens of the reference
        :param hyps - a list of tokens of the hypothesis
    
        :return maxbleu - recall bleu
        :return avgbleu - precision bleu
        """
        scores = []
        for hyp in hyps:
            try:
               # scores.append(sentence_bleu([ref], hyp, smoothing_function=SmoothingFunction().method7,
               #                         weights=[1./4, 1./4, 1./4, 1./4]))
                scores.append(smoothed_bleu(list(bleu_stats(hyp, ref))))
            except:
                scores.append(0.0)
        return np.max(scores), np.mean(scores) 
Example #19
Source File: test_bleu.py    From cotk with Apache License 2.0
def get_bleu(self, dataloader, input, reference_key, gen_key):
		refs = []
		gens = []
		for gen_sen, resp_sen in zip(input[gen_key], input[reference_key]):
			gen_sen_processed = dataloader.trim_in_ids(gen_sen)
			resp_sen_processed = dataloader.trim_in_ids(resp_sen[1:])
			refs.append([resp_sen_processed])
			gens.append(gen_sen_processed)
		gens = replace_unk(gens)
		return corpus_bleu(refs, gens, smoothing_function=SmoothingFunction().method3) 
Example #20
Source File: test_bleu.py    From cotk with Apache License 2.0
def get_bleu(self, dataloader, input, reference_key, gen_key):
		refs = []
		gens = []
		for i in range(len(input[reference_key])):
			for resp_sen, gen_sen in zip(input[reference_key][i], input[gen_key][i]):
				gen_sen_processed = dataloader.trim_in_ids(gen_sen)
				resp_sen_processed = dataloader.trim_in_ids(resp_sen)
				gens.append(gen_sen_processed)
				refs.append([resp_sen_processed[1:]])
		gens = replace_unk(gens)
		return corpus_bleu(refs, gens, smoothing_function=SmoothingFunction().method3) 
Example #21
Source File: bleu.py    From cotk with Apache License 2.0
def close(self) -> Dict[str, Any]:
		'''Return a dict which contains

			* **bleu**: bleu value.
			* **bleu hashvalue**: hash value for bleu metric, same hash value stands
			  for same evaluation settings.
		'''
		result = super().close()
		if (not self.hyps) or (not self.refs):
			raise RuntimeError("The metric has not been forwarded data correctly.")

		if self.tokenizer:
			self._do_tokenize()

		if "unk" in self.dataloader.get_special_tokens_mapping():
			self.hyps = replace_unk(self.hyps, self.dataloader.get_special_tokens_mapping()["unk"])
		try:
			weights = np.ones(self.ngram) / self.ngram
			result.update({"bleu": \
				corpus_bleu(self.refs, self.hyps, weights=weights, smoothing_function=SmoothingFunction().method3), \
				"bleu hashvalue": self._hashvalue()})
		except ZeroDivisionError as _:
			if not self.ignore_smoothing_error:
				raise ZeroDivisionError("Bleu smoothing divided by zero. This is a known bug of corpus_bleu, \
				usually caused when there is only one sample and the sample length is 1.") from None
			result.update({"bleu": \
					0, \
					"bleu hashvalue": self._hashvalue()})
		return result 
Example #22
Source File: bleu.py    From cotk with Apache License 2.0
def close(self) -> Dict[str, Any]:
		'''Return a dict which contains

			* **bleu**: bleu value.
			* **bleu hashvalue**: hash value for bleu metric, same hash value stands
			  for same evaluation settings.
		'''
		result = super().close()
		if (not self.hyps) or (not self.refs):
			raise RuntimeError("The metric has not been forwarded data correctly.")
		self.hyps = replace_unk(self.hyps, self.dataloader.unk_id)

		self._hash_unordered_list(self.refs)

		try:
			result.update({"bleu": \
				corpus_bleu(self.refs, self.hyps, smoothing_function=SmoothingFunction().method3), \
				"bleu hashvalue": self._hashvalue()})
		except ZeroDivisionError as _:
			if not self.ignore_smoothing_error:
				raise ZeroDivisionError("Bleu smoothing divided by zero. This is a known bug of corpus_bleu, \
				usually caused when there is only one sample and the sample length is 1.")
			result.update({"bleu": \
					0, \
					"bleu hashvalue": self._hashvalue()})
		return result 
Example #23
Source File: test_bleu.py    From cotk with Apache License 2.0
def get_self_bleu(self, dataloader, input, gen_key):
		gens = []
		for gen_sen in input[gen_key]:
			gen_sen_processed = dataloader.trim_in_ids(gen_sen)
			gens.append(gen_sen_processed)
		refs = copy.deepcopy(gens)
		_refs = replace_unk(refs)
		bleu_irl = []
		for i in range(len(gens)):
			bleu_irl.append(sentence_bleu(
				refs[:i] + refs[i + 1:], _refs[i], smoothing_function=SmoothingFunction().method1))
		return 1.0 * sum(bleu_irl) / len(bleu_irl) 
Example #24
Source File: evaluate.py    From tatk with Apache License 2.0
def get_bleu4(dialog_acts, golden_utts, gen_utts):
    das2utts = {}
    for das, utt, gen in zip(dialog_acts, golden_utts, gen_utts):
        utt = utt.lower()
        gen = gen.lower()
        for da, svs in das.items():
            domain, act = da.split('-')
            if act == 'Request' or domain == 'general':
                continue
            else:
                for s, v in sorted(svs, key=lambda x: x[0]):
                    if s == 'Internet' or s == 'Parking' or s == 'none' or v == 'none':
                        continue
                    else:
                        v = v.lower()
                        if (' ' + v in utt) or (v + ' ' in utt):
                            utt = utt.replace(v, '{}-{}'.format(da, s), 1)
                        if (' ' + v in gen) or (v + ' ' in gen):
                            gen = gen.replace(v, '{}-{}'.format(da, s), 1)
        hash_key = ''
        for da in sorted(das.keys()):
            for s, v in sorted(das[da], key=lambda x: x[0]):
                hash_key += da + '-' + s + ';'
        das2utts.setdefault(hash_key, {'refs': [], 'gens': []})
        das2utts[hash_key]['refs'].append(utt)
        das2utts[hash_key]['gens'].append(gen)
    # pprint(das2utts)
    refs, gens = [], []
    for das in das2utts.keys():
        for gen in das2utts[das]['gens']:
            refs.append([s.split() for s in das2utts[das]['refs']])
            gens.append(gen.split())
    bleu = corpus_bleu(refs, gens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=SmoothingFunction().method1)
    return bleu 
Example #25
Source File: evaluators.py    From NeuralDialog-ZSDG with Apache License 2.0
def get_report(self, include_error=False):
        reports = []
        tokenize = get_tokenize()

        for domain, labels in self.domain_labels.items():
            predictions = self.domain_hyps[domain]
            self.logger.info("Generate report for {} for {} samples".format(domain, len(predictions)))
            refs, hyps = [], []

            # find entity precision, recall and f1
            tp, fp, fn = 0.0, 0.0, 0.0

            for label, hyp in zip(labels, predictions):
                label = label.replace(EOS, '').replace(BOS, '')
                hyp = hyp.replace(EOS, '').replace(BOS, '')
                ref_tokens = tokenize(label)[2:]
                hyp_tokens = tokenize(hyp)[2:]

                refs.append([ref_tokens])
                hyps.append(hyp_tokens)

                label_ents = self.pred_ents(label, tokenize, None)
                hyp_ents = self.pred_ents(hyp, tokenize, None)
                # hyp_ents = list(set(hyp_ents))

                ttpp, ffpp, ffnn = self._get_tp_fp_fn(label_ents, hyp_ents)
                tp += ttpp
                fp += ffpp
                fn += ffnn

            ent_precision, ent_recall, ent_f1 = self._get_prec_recall(tp, fp, fn)

            # compute corpus level scores
            bleu = bleu_score.corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)
            report = "\nDomain: %s BLEU %f\n Entity precision %f recall %f and f1 %f\n" \
                     % (domain, bleu, ent_precision, ent_recall, ent_f1)
            reports.append(report)

        return "\n==== REPORT===={report}".format(report="========".join(reports)) 
Example #26
Source File: metrics.py    From quick-nlp with MIT License
def bleu_score(preds, targs, stoi=None):
    sf = SmoothingFunction().method1
    # Convert logits to predicted token ids and drop the final timestep.
    preds = torch.max(preds, dim=-1)[1][:-1]
    bleus = []
    # Pair each target sequence with its predicted sequence.
    for targ, pred in zip(to_np(targs), to_np(preds)):
        if len(targ) > 2:
            bleu = sentence_bleu([targ], pred, smoothing_function=sf, weights=(1 / 3., 1 / 3., 1 / 3.))
        elif len(targ) == 2:
            bleu = sentence_bleu([targ], pred, smoothing_function=sf, weights=(0.5, 0.5))
        else:
            bleu = sentence_bleu([targ], pred, smoothing_function=sf, weights=(1.0,))
        bleus.append(bleu)
    return np.mean(bleus)
Example #27
Source File: utils.py    From quick-nlp with MIT License
def print_dialogue_batch(learner: Learner, modeldata: ModelData, input_field, output_field, num_batches=1,
                         num_sentences=-1, is_test=False,
                         num_beams=1, smoothing_function=None, weights=None):
    weights = (1 / 3., 1 / 3., 1 / 3.) if weights is None else weights
    smoothing_function = SmoothingFunction().method1 if smoothing_function is None else smoothing_function
    predictions, targets, inputs = learner.predict_with_targs_and_inputs(is_test=is_test, num_beams=num_beams)
    blue_scores = []
    for batch_num, (input, target, prediction) in enumerate(zip(inputs, targets, predictions)):
        input = np.transpose(input, [1, 2, 0])  # transpose number of utterances to beams [sl, bs, nb]
        inputs_str: BatchBeamTokens = modeldata.itos(input, input_field)
        inputs_str: List[str] = ["\n".join(conv) for conv in inputs_str]
        predictions_str: BatchBeamTokens = modeldata.itos(prediction, output_field)
        targets_str: BatchBeamTokens = modeldata.itos(target, output_field)
        for index, (inp, targ, pred) in enumerate(zip(inputs_str, targets_str, predictions_str)):
            if targ[0].split() == pred[0].split()[1:]:
                blue_score = 1
            else:
                blue_score = sentence_bleu([targ[0].split()], pred[0].split()[1:],
                                           smoothing_function=smoothing_function,
                                           weights=weights
                                           )
            print(
                f'BATCH: {batch_num} SAMPLE: {index}\nINPUT:\n{"".join(inp)}\nTARGET:\n{"".join(targ)}\nPREDICTION:\n{"".join(pred)}\nbleu: {blue_score}\n\n')
            blue_scores.append(blue_score)
            if 0 < num_sentences <= index - 1:
                break
        if 0 < num_batches <= batch_num - 1:
            break
    print(f'bleu score: mean: {np.mean(blue_scores)}, std: {np.std(blue_scores)}') 
Example #28
Source File: bleu.py    From DeepPavlov with Apache License 2.0
def bleu_advanced(y_true: List[Any], y_predicted: List[Any],
                  weights: Tuple = (1,), smoothing_function=SMOOTH.method1,
                  auto_reweigh=False, penalty=True) -> float:
    """Calculate BLEU score

    Parameters:
        y_true: list of reference tokens
        y_predicted: list of query tokens
        weights: n-gram weights
        smoothing_function: SmoothingFunction
        auto_reweigh: Option to re-normalize the weights uniformly
        penalty: whether to apply the brevity penalty

    Return:
        BLEU score
    """

    bleu_measure = sentence_bleu([y_true], y_predicted, weights, smoothing_function, auto_reweigh)

    hyp_len = len(y_predicted)
    hyp_lengths = hyp_len
    ref_lengths = closest_ref_length([y_true], hyp_len)

    bpenalty = brevity_penalty(ref_lengths, hyp_lengths)

    if penalty is True or bpenalty == 0:
        return bleu_measure

    return bleu_measure / bpenalty 
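A usage sketch for bleu_advanced above, with invented token lists. Since sentence_bleu already applies the brevity penalty, passing penalty=False divides it back out (whenever the penalty is non-zero):

ref = ['the', 'cat', 'sat', 'on', 'the', 'mat']
hyp = ['the', 'cat', 'sat']
# Unigram BLEU, brevity penalty applied (default behaviour).
print(bleu_advanced(ref, hyp, weights=(1,)))
# Same measure with the brevity penalty factored out.
print(bleu_advanced(ref, hyp, weights=(1,), penalty=False))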
Example #29
Source File: test_bleu.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def test_corpus_bleu(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are located in the last 2nd line of the file.
            # The first and 2nd item in the list are the score and system names.
            mteval_bleu_scores = list(map(float, mteval_fin.readlines()[-2].split()[1:-1]))

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() automatically strip().
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_bleu input is list of list of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(
                        references, hypothesis, weights=(1.0 / i,) * i
                    )
                    # Check that the BLEU scores difference is less than 0.005 .
                    # Note: This is an approximate comparison; as much as
                    #       +/- 0.01 BLEU might be "statistically significant",
                    #       the actual translation quality might not be.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005

                # With the same smoothing method used in mteval-v13a.pl
                chencherry = SmoothingFunction()
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(
                        references,
                        hypothesis,
                        weights=(1.0 / i,) * i,
                        smoothing_function=chencherry.method3,
                    )
                    assert abs(mteval_bleu - nltk_bleu) < 0.005 
Example #30
Source File: utils.py    From Deep-Reinforcement-Learning-Hands-On with MIT License
def calc_bleu_many(cand_seq, ref_sequences):
    sf = bleu_score.SmoothingFunction()
    return bleu_score.sentence_bleu(ref_sequences, cand_seq,
                                    smoothing_function=sf.method1,
                                    weights=(0.5, 0.5))
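A possible call of calc_bleu_many above, again with invented token lists; multiple references are passed as a list of token lists and scored as bigram BLEU with method1 smoothing:

references = [['hello', 'there', 'general', 'kenobi'],
              ['hello', 'there', 'friend']]
candidate = ['hello', 'there']
print(calc_bleu_many(candidate, references))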