# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Beam pipeline to convert BooksCorpus to sharded TFRecords."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import random

from absl import app
from absl import flags
import apache_beam as beam
from bert import tokenization
from nltk.tokenize import sent_tokenize
import numpy as np
import tensorflow.compat.v1 as tf

flags.DEFINE_string(
    "input_file", None,
    "Path to the root directory of the raw input files. Assumes one "
    "subdirectory per genre, each containing one text file per book.")

flags.DEFINE_string("output_file", None, "Output TF example file.")

flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")

flags.DEFINE_integer("max_sent_length", 70,
                     "Maximum number of tokens per sentence.")

flags.DEFINE_integer("max_para_length", 30,
                     "Maximum number of sentences per document.")

flags.DEFINE_integer("random_seed", 12345, "A random seed.")

flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")

flags.DEFINE_float("test_size", 0.1,
                   "Fraction of the dataset to hold out as the test set.")

FLAGS = flags.FLAGS


def split_line_by_sentences(line):
  # Requires the NLTK "punkt" tokenizer data (nltk.download("punkt")).
  return sent_tokenize(line)


def read_file(filename):
  """Read the contents of filename (str) and split into documents by chapter."""
  all_documents = []
  document = []
  with tf.gfile.GFile(filename, "r") as reader:
    for line in reader:
      line = line.strip()
      line = tokenization.convert_to_unicode(line)
      # Normalize curly quotes to ASCII apostrophes.
      line = line.replace(u"\u2018", "'").replace(u"\u2019", "'")
      sents = split_line_by_sentences(line)
      for sent_line in sents:
        if not sent_line or len(sent_line) < 4:  # Arbitrary min length for line
          continue
        if sent_line.lower()[:7] == "chapter":
          # A chapter heading starts a new document.
          if document:
            all_documents.append(document)
            document = []
        else:
          document.append(sent_line)
          if len(document) == FLAGS.max_para_length:
            all_documents.append(document)
            document = []
  if document:
    all_documents.append(document)

  # Remove small documents.
  all_documents = [x for x in all_documents if len(x) >= 8]

  return all_documents


def create_bytes_feature(value):
  if isinstance(value, type(tf.constant(0))):
    # BytesList won't unpack a string from an EagerTensor.
    value = value.numpy()
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def create_int_feature(values):
  feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
  return feature


def convert_instance_to_tf_example(tokenizer, sent_tokens, max_sent_length,
                                   max_para_length):
  """Convert a list of tokenized sentences into a tf.Example."""
  input_ids_list = [
      tokenizer.convert_tokens_to_ids(tokens) for tokens in sent_tokens
  ]

  features = collections.OrderedDict()

  # Pad or trim each sentence to max_sent_length tokens, and pad the
  # document with empty sentences up to max_para_length.
  sent_tensor = []
  for i in range(max_para_length):
    if i >= len(input_ids_list):
      sent_tensor.append([0] * max_sent_length)
    else:
      padded_ids = np.pad(
          input_ids_list[i], (0, max_sent_length),
          mode="constant")[:max_sent_length]
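      # np.pad appends max_sent_length zeros, so the slice above both pads
      # short sentences and truncates long ones to exactly max_sent_length.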
      sent_tensor.append(padded_ids)

  sent_tensor = np.ravel(np.stack(sent_tensor))
  features["sents"] = create_int_feature(sent_tensor)
  tf_example = tf.train.Example(features=tf.train.Features(feature=features))
  return tf_example


def preproc_doc(document):
  """Convert a document into a serialized TF Example.

  Args:
    document: a list of sentences produced by read_file (one chapter-sized
      chunk of a book).

  Returns:
    A list containing a single tf.Example with the document's sentences
    packed into a fixed-size "sents" feature, serialized to string so it can
    be written directly to a TFRecord. Returns an empty list for documents
    with fewer than 8 usable sentences.
  """
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  sent_tokens = [tokenizer.tokenize(sent) for sent in document if sent]
  sent_tokens = [sent for sent in sent_tokens if len(sent) > 1]
  if len(sent_tokens) < 8:
    return []

  # Convert token lists into ids, adding any needed tokens and padding for
  # BERT.
  tf_example = convert_instance_to_tf_example(tokenizer, sent_tokens,
                                              FLAGS.max_sent_length,
                                              FLAGS.max_para_length)

  # Serialize the TF Example so it can be written directly to file.
  tf_examples = [tf_example.SerializeToString()]

  return tf_examples


def ccnews_pipeline():
  """Read BooksCorpus filenames and create the Beam pipeline."""
  # Set a random seed for reproducibility.
  rng = random.Random(FLAGS.random_seed)

  # BooksCorpus is organized into directories of genre and files of books.
  # adventure-all.txt seems to contain all the adventure books in one file,
  # and romance-all.txt is the same. None of the other directories have this,
  # so we skip those two files to avoid double counting their books.
  file_name_set = set()
  input_files_by_genre = collections.defaultdict(list)
  for path, _, fnames in tf.gfile.Walk(FLAGS.input_file):
    genre = path.split("/")[-1]
    for fname in fnames:
      if fname == "adventure-all.txt" or fname == "romance-all.txt":
        continue
      if fname in file_name_set:
        continue
      file_name_set.add(fname)
      input_files_by_genre[genre].append(path + "/" + fname)

  # Sort genres and iterate in order for reproducibility.
  train_files, test_files = [], []
  for genre, file_list in sorted(input_files_by_genre.items()):
    rng.shuffle(file_list)
    genre_size = len(file_list)
    test_size = int(FLAGS.test_size * genre_size)
    test_files.extend(file_list[:test_size])
    train_files.extend(file_list[test_size:])
    assert (len(file_list[:test_size]) +
            len(file_list[test_size:]) == len(file_list))

  # Make sure there is no train/test overlap.
  for filename in train_files:
    assert filename not in test_files

  rng.shuffle(train_files)
  rng.shuffle(test_files)

  def pipeline(root):
    """Beam pipeline for converting BooksCorpus files to TF Examples."""
    _ = (
        root
        | "Create test files" >> beam.Create(test_files)
        | "Read test files" >> beam.FlatMap(read_file)
        | "test Shuffle" >> beam.Reshuffle()
        | "Preproc test docs" >> beam.FlatMap(preproc_doc)
        | "record test Shuffle" >> beam.Reshuffle()
        | "Write to test tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.test.tfrecord", num_shards=50))
    _ = (
        root
        | "Create train files" >> beam.Create(train_files)
        | "Read train files" >> beam.FlatMap(read_file)
        | "train Shuffle" >> beam.Reshuffle()
        | "Preproc train docs" >> beam.FlatMap(preproc_doc)
        | "record train Shuffle" >> beam.Reshuffle()
        | "Write to train tfrecord" >> beam.io.WriteToTFRecord(
            FLAGS.output_file + ".cc_cpc.train.tfrecord", num_shards=450))

  return pipeline


def main(_):
  # If using Apache Beam, execute the runner here.
  pass


if __name__ == "__main__":
  app.run(main)
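# A minimal sketch of executing the pipeline locally, assuming Beam's bundled
# DirectRunner (the choice of runner and its options are not part of this
# module and depend on the deployment environment):
#
#   from apache_beam.options.pipeline_options import PipelineOptions
#
#   def main(_):
#     options = PipelineOptions(["--runner=DirectRunner"])
#     with beam.Pipeline(options=options) as root:
#       ccnews_pipeline()(root)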