# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Script to apply `text_utils.simplify_nq_example` to all examples in a split.

We have provided the processed training set at the link below.

https://storage.cloud.google.com/natural_questions/v1.0-simplified/simplified-nq-train.jsonl.gz

The test set, used by NQ's competition website, is only provided in the original
NQ format. If you wish to use the simplified format, then you should call
`text_utils.simplify_nq_example` in your submitted system.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import glob
import gzip
import json
import os
import time

from absl import app
from absl import flags

import text_utils as text_utils

FLAGS = flags.FLAGS

# Directory holding the input shards; `main` also writes its output there.
# NOTE: adjacent string literals are concatenated verbatim, so the separating
# space must be part of one of the pieces.
flags.DEFINE_string(
    "data_dir", None, "Path to directory containing original NQ "
    "files, matching the pattern `nq-<split>-??.jsonl.gz`.")


def main(_):
  """Runs `text_utils.simplify_nq_example` over all shards of a split.

  Reads every gzipped shard in `FLAGS.data_dir` whose name matches
  `nq-*-??.jsonl.gz`, simplifies each JSON line, and writes the results (one
  JSON object per line) to a single gzipped file named
  `simplified-nq-<split>.jsonl.gz` in the same directory. `<split>` is the
  name of the data directory itself.

  Args:
    _: Unused positional command-line arguments.

  Raises:
    app.UsageError: If `--data_dir` was not provided.
  """
  if not FLAGS.data_dir:
    # Without this check `os.path.basename(None)` fails with an opaque
    # TypeError.
    raise app.UsageError("--data_dir must be set.")

  # `abspath` strips trailing separators and resolves "."; otherwise
  # `basename("data/train/")` is "" and the output file loses its split name.
  split = os.path.basename(os.path.abspath(FLAGS.data_dir))
  outpath = os.path.join(FLAGS.data_dir,
                         "simplified-nq-{}.jsonl.gz".format(split))

  # `glob.glob` returns paths in arbitrary order; sort so that the order of
  # examples in the output is reproducible across runs and machines.
  inpaths = sorted(
      glob.glob(os.path.join(FLAGS.data_dir, "nq-*-??.jsonl.gz")))

  with gzip.open(outpath, "wb") as fout:
    num_processed = 0
    start = time.time()
    for inpath in inpaths:
      print("Processing {}".format(inpath))
      with gzip.open(inpath, "rb") as fin:
        for l in fin:
          # Files are opened in binary mode, so decode/encode explicitly.
          utf8_in = l.decode("utf8", "strict")
          utf8_out = json.dumps(
              text_utils.simplify_nq_example(json.loads(utf8_in))) + u"\n"
          fout.write(utf8_out.encode("utf8"))
          num_processed += 1
          # Report progress every 100 examples.
          if not num_processed % 100:
            print("Processed {} examples in {}.".format(num_processed,
                                                        time.time() - start))


# Entry point when executed as a script: hand `main` to absl's `app.run`.
if __name__ == "__main__":
  app.run(main)