from collections import OrderedDict

import numpy as np
import torch
import torch.nn.functional as F

from claf.decorator import arguments_required
from claf.metric import korquad_v1_official, squad_v1_official, squad_v2_official
from claf.model.base import ModelBase


class ReadingComprehension:
    """
    Reading Comprehension Mixin Class

    * Args:
        token_embedder: 'RCTokenEmbedder', Used to embed the 'context' and 'question'.

    """

    def get_best_span(self, span_start_logits, span_end_logits, answer_maxlen=None):
        """
        Take argmax of constrained score_s * score_e.

        * Args:
            span_start_logits: independent start logits
            span_end_logits: independent end logits

        * Kwargs:
            answer_maxlen: max span length to consider (default is None -> All)
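
        * Example (illustrative only; the tensors are made-up values for a batch of one, four tokens):
            start_logits = torch.tensor([[0.1, 3.0, 0.2, 0.1]])
            end_logits = torch.tensor([[0.1, 0.2, 2.5, 0.1]])
            self.get_best_span(start_logits, end_logits)
            # -> tensor([[1, 2]]): the best span starts at token 1 and ends at token 2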
        """

        B = span_start_logits.size(0)
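        # best_word_span[i] will hold the predicted (start, end) token indices for the i-th example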
        best_word_span = span_start_logits.new_zeros((B, 2), dtype=torch.long)

        score_starts = F.softmax(span_start_logits, dim=-1)
        score_ends = F.softmax(span_end_logits, dim=-1)

        max_len = answer_maxlen or score_starts.size(1)

        for i in range(score_starts.size(0)):
            # Outer product of scores to get full p_s * p_e matrix
            scores = torch.ger(score_starts[i], score_ends[i])

            # Zero out negative length and over-length span scores
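            # (triu_ keeps only spans with end >= start; tril_(max_len - 1) drops spans longer than max_len tokens)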
            scores.triu_().tril_(max_len - 1)

            # Take the argmax over the flattened score matrix
            scores = scores.detach().cpu().numpy()
            best_idx = np.argmax(scores.flatten())

            # Convert the flat index back into (start, end) coordinates
            s_idx, e_idx = np.unravel_index(best_idx, scores.shape)
            best_word_span[i, 0] = int(s_idx)
            best_word_span[i, 1] = int(e_idx)

        return best_word_span

    def _make_span_metrics(self, predictions):
        """ span accuracy metrics """
        start_accuracy, end_accuracy, span_accuracy = 0, 0, 0

        for index, preds in predictions.items():
            _, _, (answer_start, answer_end) = self._dataset.get_ground_truths(index)

            start_acc = 1 if preds["pred_span_start"] == answer_start else 0
            end_acc = 1 if preds["pred_span_end"] == answer_end else 0
            span_acc = 1 if start_acc == 1 and end_acc == 1 else 0

            start_accuracy += start_acc
            end_accuracy += end_acc
            span_accuracy += span_acc

        start_accuracy = 100.0 * start_accuracy / len(self._dataset)
        end_accuracy = 100.0 * end_accuracy / len(self._dataset)
        span_accuracy = 100.0 * span_accuracy / len(self._dataset)

        return {"start_acc": start_accuracy, "end_acc": end_accuracy, "span_acc": span_accuracy}

    def make_predictions(self, output_dict):
        """
        Make predictions with model's output_dict

        * Args:
            output_dict: model's output dictionary consisting of
                - data_idx: dataset index of the question
                - best_span: best (start, end) span computed from span_start_logits and span_end_logits
                - start_logits: span start logits
                - end_logits: span end logits

        * Returns:
            predictions: prediction dictionary consisting of
                - key: 'id' (question id)
                - value: dictionary consisting of
                    predict_text, pred_span_start, pred_span_end, start_logits, end_logits
        """

        data_indices = output_dict["data_idx"]
        best_word_span = output_dict["best_span"]

        return OrderedDict(
            [
                (
                    index.item(),
                    {
                        "predict_text": self._dataset.get_text_with_index(
                            index.item(), best_span[0], best_span[1]
                        ),
                        "pred_span_start": best_span[0],
                        "pred_span_end": best_span[1],
                        "start_logits": start_logits,
                        "end_logits": end_logits,
                    },
                )
                for index, best_span, start_logits, end_logits in zip(
                    list(data_indices.data),
                    list(best_word_span.data),
                    list(output_dict["start_logits"].data),
                    list(output_dict["end_logits"].data),
                )
            ]
        )

    @arguments_required(["context", "question"])
    def predict(self, output_dict, arguments, helper):
        """
        Inference by raw_feature

        * Args:
            output_dict: model's output dictionary consisting of
                - data_idx: dataset index of the question
                - best_span: best (start, end) span computed from span_start_logits and span_end_logits
            arguments: user input dictionary (must include 'context' and 'question')
            helper: dictionary for helping get answer

        * Returns:
            answer: dictionary with the predicted answer 'text' and its 'score'
        """
        span_start, span_end = list(output_dict["best_span"][0].data)
        word_start = span_start.item()
        word_end = span_end.item()

        text_span = helper["text_span"]
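        # each entry of text_span holds the (char_start, char_end) offsets of the corresponding word in the raw context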
        char_start = text_span[word_start][0]
        char_end = text_span[word_end][1]

        context_text = arguments["context"]
        answer_text = context_text[char_start:char_end]

        start_logit = output_dict["start_logits"][0]
        end_logit = output_dict["end_logits"][0]

        score = start_logit[span_start] + end_logit[span_end]
        score = score.item()

        return {"text": answer_text, "score": score}

    def print_examples(self, index, inputs, predictions):
        """
        Print evaluation examples

        * Args:
            index: data index
            inputs: mini-batch inputs
            predictions: prediction dictionary consisting of
                - key: 'id' (question id)
                - value: dictionary consisting of
                    predict_text, pred_span_start, pred_span_end, start_logits, end_logits

        * Returns:
            None (prints the context, question, answers, and predicted text)
        """
        data_index = inputs["labels"]["data_idx"][index].item()
        qid = self._dataset.get_qid(data_index)
        if "#" in qid:  # bert case (qid#index)
            qid = qid.split("#")[0]

        helper = self._dataset.helper

        context = helper["examples"][qid]["context"]
        question = helper["examples"][qid]["question"]
        answers = helper["examples"][qid]["answers"]

        predict_text = predictions[data_index]["predict_text"]

        print()
        print("- Context:", context)
        print("- Question:", question)
        print("- Answers:", answers)
        print("- Predict:", predict_text)
        print()

    def write_predictions(self, predictions, file_path=None, is_dict=True):
        pass
        # TODO: start and end logits (TypeError: Object of type 'Tensor' is not JSON serializable)
        # try:
            # super(ReadingComprehension, self).write_predictions(
                # predictions, file_path=file_path, is_dict=is_dict
            # )
        # except AttributeError:
            # # TODO: Need to Fix
            # model_base = ModelBase()
            # model_base._log_dir = self._log_dir
            # model_base._train_counter = self._train_counter
            # model_base.training = self.training
            # model_base.write_predictions(predictions, file_path=file_path, is_dict=is_dict)


class SQuADv1(ReadingComprehension):
    """
    Reading Comprehension Mixin Class
        with SQuAD v1.1 evaluation

    * Args:
        token_embedder: 'QATokenEmbedder', Used to embed the 'context' and 'question'.

    """

    def make_metrics(self, predictions):
        """
        Make metrics with prediction dictionary

        * Args:
            predictions: prediction dictionary consisting of
                - key: 'id' (question id)
                - value: dictionary consisting of
                    predict_text, pred_span_start, pred_span_end, start_logits, end_logits

        * Returns:
            metrics: metric dictionary consisting of
                - 'em': exact_match (SQuAD v1.1 official evaluation)
                - 'f1': f1 (SQuAD v1.1 official evaluation)
                - 'start_acc': span_start accuracy
                - 'end_acc': span_end accuracy
                - 'span_acc': span accuracy (start and end)
        """

        preds = {}
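        # The official evaluation scripts expect predictions keyed by question id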
        for index, prediction in predictions.items():
            _, _, (answer_start, answer_end) = self._dataset.get_ground_truths(index)

            qid = self._dataset.get_qid(index)
            preds[qid] = prediction["predict_text"]

        self.write_predictions(preds)

        squad_official_metrics = self._make_metrics_with_official(preds)

        metrics = self._make_span_metrics(predictions)
        metrics.update(squad_official_metrics)
        return metrics

    def _make_metrics_with_official(self, preds):
        """ SQuAD v1.1 official evaluation """
        dataset = self._dataset.raw_dataset

        if self.lang_code.startswith("ko"):
            scores = korquad_v1_official.evaluate(dataset, preds)
        else:
            scores = squad_v1_official.evaluate(dataset, preds)
        return scores


class SQuADv1ForBert(SQuADv1):
    """
    Reading Comprehension Mixin Class
        with SQuAD v1.1 evaluation, for BERT-style models

    * Args:
        token_embedder: 'QATokenEmbedder', Used to embed the 'context' and 'question'.

    """

    def make_metrics(self, predictions):
        """ BERT predictions need to get nbest result """

        best_predictions = {}
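        # A long context may be split into several windows for BERT, so one question can have
        # multiple candidate spans; collect them per question id and keep the best-scoring text below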
        for index, prediction in predictions.items():
            qid = self._dataset.get_qid(index)

            predict_text = prediction["predict_text"]

            start_logit = prediction["start_logits"][prediction["pred_span_start"]]
            end_logit = prediction["end_logits"][prediction["pred_span_end"]]
            predict_score = start_logit.item() + end_logit.item()

            if qid not in best_predictions:
                best_predictions[qid] = []
            best_predictions[qid].append((predict_text, predict_score))

        for qid, candidates in best_predictions.items():
            sorted_candidates = sorted(candidates, key=lambda x: x[1], reverse=True)
            best_predictions[qid] = sorted_candidates[0][0]

        self.write_predictions(best_predictions)
        return self._make_metrics_with_official(best_predictions)

    def predict(self, output_dict, arguments, helper):
        """
        Inference by raw_feature

        * Args:
            output_dict: model's output dictionary consisting of
                - data_idx: dataset index of the question
                - best_span: best (start, end) span computed from span_start_logits and span_end_logits
            arguments: arguments dictionary consisting of user_input
            helper: dictionary for helping get answer

        * Returns:
            answer: dictionary with the predicted answer 'text' and its 'score'
        """

        context_text = arguments["context"]
        bert_tokens = helper["bert_token"]
        predictions = [
            (best_span, start_logits, end_logits)
            for best_span, start_logits, end_logits in zip(
                list(output_dict["best_span"].data),
                list(output_dict["start_logits"].data),
                list(output_dict["end_logits"].data),
            )
        ]

        best_predictions = []
        for index, prediction in enumerate(predictions):
            bert_token = bert_tokens[index]
            best_span, start_logits, end_logits = prediction
            pred_start, pred_end = best_span

            predict_text = ""
            if (
                pred_start < len(bert_token)
                and pred_end < len(bert_token)
                and bert_token[pred_start].text_span is not None
                and bert_token[pred_end].text_span is not None
            ):
                char_start = bert_token[pred_start].text_span[0]
                char_end = bert_token[pred_end].text_span[1]
                predict_text = context_text[char_start:char_end]

            start_logit = start_logits[pred_start]
            end_logit = end_logits[pred_end]
            predict_score = start_logit.item() + end_logit.item()

            best_predictions.append((predict_text, predict_score))

        sorted_predictions = sorted(best_predictions, key=lambda x: x[1], reverse=True)
        return {"text": sorted_predictions[0][0], "score": sorted_predictions[0][1]}


class SQuADv2(ReadingComprehension):
    """
    Reading Comprehension Mixin Class
        with SQuAD v2.0 evaluation

    * Args:
        token_embedder: 'RCTokenEmbedder', Used to embed the 'context' and 'question'.

    """

    def make_metrics(self, predictions):
        """
        Make metrics with prediction dictionary

        * Args:
            predictions: prediction dictionary consisting of
                - key: 'id' (question id)
                - value: dictionary consisting of
                    predict_text, pred_span_start, pred_span_end, start_logits, end_logits

        * Returns:
            metrics: metric dictionary consisting of
                - 'start_acc': span_start accuracy
                - 'end_acc': span_end accuracy
                - 'span_acc': span accuracy (start and end)
                - 'em': exact_match (SQuAD v2.0 official evaluation)
                - 'f1': f1 (SQuAD v2.0 official evaluation)
                - 'HasAns_exact': has answer exact_match
                - 'HasAns_f1': has answer f1
                - 'NoAns_exact': no answer exact_match
                - 'NoAns_f1': no answer f1
                - 'best_exact': best exact_match score with best_exact_thresh
                - 'best_exact_thresh': best exact_match answerable threshold
                - 'best_f1': best f1 score with best_f1_thresh
                - 'best_f1_thresh': best f1 answerable threshold
        """

        preds, na_probs = {}, {}
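        # The v2.0 official evaluation additionally needs a per-question no-answer probability (na_probs)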
        for index, prediction in predictions.items():
            _, _, (answer_start, answer_end) = self._dataset.get_ground_truths(index)

            # Metrics (SQuAD official metric)
            predict_text = prediction["predict_text"]
            if predict_text == "<noanswer>":
                predict_text = ""

            qid = self._dataset.get_qid(index)
            preds[qid] = predict_text

            span_start_probs = F.softmax(prediction["start_logits"], dim=-1)
            span_end_probs = F.softmax(prediction["end_logits"], dim=-1)

            # Probability mass at the final position is treated as the no-answer score for each distribution;
            # their product is this question's no-answer probability
            start_no_prob = span_start_probs[-1].item()
            end_no_prob = span_end_probs[-1].item()
            no_answer_prob = start_no_prob * end_no_prob
            na_probs[qid] = no_answer_prob

        self.write_predictions(preds)

        model_type = "train" if self.training else "valid"
        self.write_predictions(
            na_probs, file_path=f"na_probs-{model_type}-{self._train_counter.get_display()}.json"
        )

        squad_official_metrics = self._make_metrics_with_official(preds, na_probs)

        metrics = self._make_span_metrics(predictions)
        metrics.update(squad_official_metrics)
        return metrics

    def _make_metrics_with_official(self, preds, na_probs, na_prob_thresh=1.0):
        """ SQuAD 2.0 official evaluation """
        dataset = self._dataset.raw_dataset

        squad_scores = squad_v2_official.evaluate(dataset, na_probs, preds)
        squad_scores["em"] = squad_scores["exact"]

        remove_keys = ["total", "exact", "HasAns_total", "NoAns_total"]
        for key in remove_keys:
            if key in squad_scores:
                del squad_scores[key]

        return squad_scores