# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataset from the Natural Questions long answer task.

Fields:
  `question`: <string> [question_len]; tokens in the question.
  `context`: <string> [num_candidates, context_len]; tokens in each candidate.
  'answer_indices': <int32>[num_annotations]: answer indicated by each
  annotator.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import flags

import tensorflow.compat.v1 as tf

flags.DEFINE_string("nq_long_train_pattern", None,
                    "Path to NQ long answer training data.")

flags.DEFINE_string("nq_long_eval_pattern", None,
                    "Path to NQ long answer eval data.")

FLAGS = flags.FLAGS


def split_on_whitespace(str_tensor):
  """Splits a scalar string tensor into a 1-D tensor of whitespace tokens."""
  return tf.string_split(tf.expand_dims(str_tensor, -1)).values


def parse_example(serialized_example):
  """Parse example."""
  features = tf.parse_single_example(
      serialized_example,
      features={
          "question":
              tf.FixedLenFeature([], tf.string),
          "context":
              tf.FixedLenSequenceFeature(
                  dtype=tf.string, shape=[], allow_missing=True),
          "long_answer_indices":
              tf.FixedLenSequenceFeature(
                  dtype=tf.int64, shape=[], allow_missing=True)
      })
  features["question"] = features["question"]
  features["context"] = features["context"]
  features["long_answer_indices"] = tf.to_int32(features["long_answer_indices"])
  return features
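

# Illustrative sketch (not part of the original pipeline): shows how a record
# matching the parse spec in `parse_example` could be serialized. The helper
# name and the encoding choices below are hypothetical.
def make_serialized_example(question, contexts, long_answer_indices):
  """Serializes one NQ long-answer example that `parse_example` can read."""
  example = tf.train.Example(
      features=tf.train.Features(
          feature={
              # Scalar string, matching the FixedLenFeature spec.
              "question":
                  tf.train.Feature(
                      bytes_list=tf.train.BytesList(
                          value=[question.encode("utf-8")])),
              # Variable-length string list, one entry per candidate.
              "context":
                  tf.train.Feature(
                      bytes_list=tf.train.BytesList(
                          value=[c.encode("utf-8") for c in contexts])),
              # Variable-length int64 list, one entry per annotator.
              "long_answer_indices":
                  tf.train.Feature(
                      int64_list=tf.train.Int64List(
                          value=long_answer_indices)),
          }))
  return example.SerializeToString()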


def get_dataset(is_train):
  """Gets a tf.data.Dataset representing the NQ data."""
  if is_train:
    data_pattern = FLAGS.nq_long_train_pattern
  else:
    data_pattern = FLAGS.nq_long_eval_pattern

  data_files = tf.gfile.Glob(data_pattern)
  assert data_files, "No files found matching %s" % data_pattern

  def _load_records(filenames):
    return tf.data.TFRecordDataset(filenames, buffer_size=16 * 1024)

  if is_train:
    # During training, read from all files in parallel to improve the speed of
    # the input pipeline.
    dataset = tf.data.Dataset.from_tensor_slices(tf.constant(data_files))
    dataset = dataset.apply(
        tf.data.experimental.shuffle_and_repeat(buffer_size=len(data_files)))
    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(
            _load_records, sloppy=is_train, cycle_length=len(data_files)))
  else:
    dataset = _load_records(data_files)
  dataset = dataset.map(parse_example, num_parallel_calls=6)
  return dataset
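

# Minimal usage sketch, illustrative only: builds the training dataset and
# logs one parsed example using TF1-style graph execution. It assumes that
# --nq_long_train_pattern points at existing TFRecord files; this entry
# point is not part of the original module.
def main(_):
  dataset = get_dataset(is_train=True)
  # A one-shot iterator suffices here; the pipeline has no initializable
  # state beyond the file list captured at graph construction.
  features = dataset.make_one_shot_iterator().get_next()
  with tf.Session() as sess:
    parsed = sess.run(features)
    tf.logging.info("question: %s", parsed["question"])
    tf.logging.info("num candidates: %d", parsed["context"].shape[0])


if __name__ == "__main__":
  from absl import app  # Local import to keep the sketch self-contained.
  app.run(main)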