python source code of prepare

"""
Turn a merged corpus into tfrecord files.

NOTE: You will want to do this using several processes. I did this on an AWS machine with 72 CPUs using GNU parallel
as that's where I had the deduplicated RealNews dataset.
"""
import argparse
import ujson as json
# from sample.encoder import get_encoder, tokenize_for_grover_training, detokenize, sliding_window, create_int_feature
import random
import tensorflow.compat.v1 as tf
import collections
import os
from tempfile import TemporaryDirectory

from tokenization import tokenization

parser = argparse.ArgumentParser(description='SCRAPE!')
parser.add_argument(
    '-fold',
    dest='fold',
    default=0,
    type=int,
    help='which fold we are on'
)
parser.add_argument(
    '-num_folds',
    dest='num_folds',
    default=1,
    type=int,
    help='Number of folds (corresponding to both the number of training files and the number of testing files)',
)
parser.add_argument(
    '-seed',
    dest='seed',
    default=1337,
    type=int,
    help='which seed to use'
)
parser.add_argument(
    '-base_fn',
    dest='base_fn',
    default='news2016zh_',
    type=str,
    help='We will output files that are like {base_fn}_{n}.tfrecord for n in 0, ..., 1023'
)

parser.add_argument(
    '-input_fn',
    dest='input_fn',
    default='realnews.jsonl',
    type=str,
    help='Base filename to use. THIS MUST BE A LOCAL FILE.'
)
parser.add_argument(
    '-max_seq_length',
    dest='max_seq_length',
    default=1025,
    type=int,
    help='Max sequence length',
)


args = parser.parse_args()
random.seed(args.seed + args.fold)

#encoder = get_encoder()
tokenizer = tokenization.FullTokenizer(
    vocab_file="bert-base-chinese-vocab.txt", do_lower_case=True)


class TFRecordWriter(object):
    def __init__(self, fn):
        self.fn = fn
        if fn.startswith('gs://'):
            from google.cloud import storage
            self.s3client = None
            self.gclient = storage.Client()
            self.storage_dir = TemporaryDirectory()
            self.writer = tf.python_io.TFRecordWriter(
                os.path.join(self.storage_dir.name, 'temp.tfrecord'))
            self.bucket_name, self.file_name = self.fn.split(
                'gs://', 1)[1].split('/', 1)

        else:
            self.s3client = None
            self.gclient = None
            self.bucket_name = None
            self.file_name = None
            self.storage_dir = None
            self.writer = tf.python_io.TFRecordWriter(fn)

    def write(self, x):
        self.writer.write(x)

    def close(self):
        self.writer.close()

        if self.gclient is not None:
            bucket = self.gclient.get_bucket(self.bucket_name)
            blob = bucket.blob(self.file_name)
            blob.upload_from_filename(os.path.join(
                self.storage_dir.name, 'temp.tfrecord'))
            self.storage_dir.cleanup()

    def __enter__(self):
        # Called when entering "with" context.
        return self

    def __exit__(self, *_):
        # Called when exiting "with" context.
        # Upload shit
        print("CALLING CLOSE")
        self.close()


def article_iterator(tokenizer):
    """ Iterate through the provided filename + tokenize"""
    assert os.path.exists(args.input_fn)
    for (dirpath, dirnames, filenames) in os.walk(args.input_fn):
        for filename in filenames:
            with open(os.path.join(dirpath, filename), 'r') as f:
                for l_no, l in enumerate(f):
                    if l_no % args.num_folds == args.fold:
                        article = json.loads(l)

                        line = tokenization.convert_to_unicode(
                            article['text'])  # for news2016zh text body
                        tokens = tokenizer.tokenize(line)
                        input_ids = tokenizer.convert_tokens_to_ids(tokens)

                        article['input_ids'] = input_ids

                        article['inst_index'] = (l_no // args.num_folds)
                        if article['inst_index'] < 100:
                            print('---\nINPUT{}. {}\n---\nTokens: {}\n'.format(article['inst_index'],
                                                                            tokens,
                                                                            input_ids
                                                                            ), flush=True)
                        if len(article['input_ids']) <= 64:  # min size of article
                            continue
                        yield article


def create_int_feature(values):
    feature = tf.train.Feature(
        int64_list=tf.train.Int64List(value=list(values)))
    return feature


def buffered_and_sliding_window_article_iterator(tokenizer, final_desired_size=1025):
    """ We apply a sliding window to fix long sequences, and use a buffer that combines short sequences."""
    for article in article_iterator(tokenizer):
        if len(article['input_ids']) >= final_desired_size:
            article['input_ids'] = article['input_ids'][0:final_desired_size-1]
        while len(article['input_ids']) < final_desired_size:
            article['input_ids'].append(0)
        yield article


# OK now write the tfrecord file
total_written = 0
train_file = args.base_fn + 'train_wiki19_{:04d}.tfrecord'.format(args.fold)
with TFRecordWriter(train_file) as train_writer:
    for article in buffered_and_sliding_window_article_iterator(tokenizer,
                                                                final_desired_size=args.max_seq_length + 1):
        writer2use = train_writer
        assert len(article['input_ids']) == (args.max_seq_length + 1)

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(article['input_ids'])
        tf_example = tf.train.Example(
            features=tf.train.Features(feature=features))

        writer2use.write(tf_example.SerializeToString())
        total_written += 1

        # DEBUG
        if article['inst_index'] < 5:
            print("~~~\nIndex {}. ARTICLE: {}\n---\nTokens: {}\n\n".format(article['inst_index'],
                                                                           tokenizer.convert_ids_to_tokens(
                                                                               article['input_ids']),
                                                                           article['input_ids']
                                                                           ), flush=True)
        if article['inst_index'] % 1000 == 0:
            print("{} articles, {} written".format(
                article['inst_index'], total_written), flush=True)
print("DONE UPLOADING", flush=True)