python source code of dataset

"""

Parses PeerRead data into a Bert-based model compatible format, and stores as tfrecord

See dataset.py for the corresponding code to read this data

"""
import argparse
import numpy as np
import pandas as pd
from scipy.special import logit, expit

import tensorflow as tf
try:
    import mkl_random as random
except ImportError:
    import numpy.random as random

import bert.tokenization as tokenization
from PeerRead.dataset.sentence_masking import create_masked_lm_predictions

# hardcoded because protobuff is not self describing for some bizarre reason
all_context_features = \
    {'accepted': tf.int64,
     'most_recent_reference_year': tf.int64,
     'num_recent_references': tf.int64,
     'num_references': tf.int64,
     'num_refmentions': tf.int64,
     # 'avg_length_reference_mention_contexts': tf.float32,
     'abstract_contains_deep': tf.int64,
     'abstract_contains_neural': tf.int64,
     'abstract_contains_embedding': tf.int64,
     'abstract_contains_outperform': tf.int64,
     'abstract_contains_novel': tf.int64,
     'abstract_contains_state-of-the-art': tf.int64,
     "title_contains_deep": tf.int64,
     "title_contains_neural": tf.int64,
     "title_contains_embedding": tf.int64,
     "title_contains_gan": tf.int64,
     'num_ref_to_figures': tf.int64,
     'num_ref_to_tables': tf.int64,
     'num_ref_to_sections': tf.int64,
     'num_uniq_words': tf.int64,
     'num_sections': tf.int64,
     # 'avg_sentence_length': tf.float32,
     'contains_appendix': tf.int64,
     'title_length': tf.int64,
     'num_authors': tf.int64,
     'num_ref_to_equations': tf.int64,
     'num_ref_to_theorems': tf.int64,
     'id': tf.int64,
     'year': tf.int64,
     'venue': tf.int64,
     'arxiv': tf.int64,
     'many_split': tf.int64}


def compose(*fns):
    """ Composes the given functions in reverse order.

    Parameters
    ----------
    fns: the functions to compose

    Returns
    -------
    comp: a function that represents the composition of the given functions.
    """
    import functools

    def _apply(x, f):
        if isinstance(x, tuple):
            return f(*x)
        else:
            return f(x)

    def comp(*args):
        return functools.reduce(_apply, fns, args)

    return comp


def make_parser(abs_seq_len=250):
    context_features = {k: tf.FixedLenFeature([], dtype=v) for k, v in all_context_features.items()}

    abstract_features = {
        "token_ids": tf.FixedLenFeature([abs_seq_len], tf.int64),
        "token_mask": tf.FixedLenFeature([abs_seq_len], tf.int64),
        # "segment_ids": tf.FixedLenFeature([abs_seq_len], tf.int64),
    }

    _name_to_features = {**context_features, **abstract_features}

    def parser(record):
        tf_example = tf.parse_single_example(
            record,
            features=_name_to_features
        )

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(tf_example.keys()):
            t = tf_example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            tf_example[name] = t

        return tf_example

    return parser


def make_input_id_masker(tokenizer, seed):
    # (One of) Bert's unsupervised objectives is to mask some fraction of the input words and predict the masked words

    def masker(data):
        token_ids = data['token_ids']
        maybe_masked_input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights = create_masked_lm_predictions(
            token_ids,
            # pre-training defaults from Bert docs
            masked_lm_prob=0.15,
            max_predictions_per_seq=20,
            vocab=tokenizer.vocab,
            seed=seed)
        return {
            **data,
            'maybe_masked_input_ids': maybe_masked_input_ids,
            'masked_lm_positions': masked_lm_positions,
            'masked_lm_ids': masked_lm_ids,
            'masked_lm_weights': masked_lm_weights
        }

    return masker


def make_extra_feature_cleaning():
    def extra_feature_cleaning(data):
        data['num_authors'] = tf.minimum(data['num_authors'], 6)-1
        data['year'] = data['year']-2007

        # some extras
        equation_referenced = tf.minimum(data['num_ref_to_equations'], 1)
        theorem_referenced = tf.minimum(data['num_ref_to_theorems'], 1)

        # buzzy title
        any_buzz = data["title_contains_deep"] + data["title_contains_neural"] + \
                   data["title_contains_embedding"] + data["title_contains_gan"]
        buzzy_title = tf.cast(tf.not_equal(any_buzz, 0), tf.int32)

        return {**data,
                'equation_referenced': equation_referenced,
                'theorem_referenced': theorem_referenced,
                'buzzy_title': buzzy_title,
                'index': data['id']}
    return extra_feature_cleaning


def make_label():
    """
    Do something slightly nuts for testing purposes
    :return:
    """
    def labeler(data):
        return {**data, 'label_ids': data['accepted']}

    # def wacky_labeler(data):
    #     label_ids = tf.greater_equal(data['num_authors'], 4)
    #     label_ids = tf.cast(label_ids, tf.int32)
    #     return {**data, 'label_ids': label_ids}

    return labeler


def outcome_sim(beta0, beta1, gamma, treatment, confounding, noise, setting="simple"):
    if setting == "simple":
        y0 = beta1 * confounding
        y1 = beta0 + y0

        simulated_score = (1. - treatment) * y0 + treatment * y1 + gamma * noise

    elif setting == "multiplicative":
        y0 = beta1 * confounding
        y1 = beta0 * y0

        simulated_score = (1. - treatment) * y0 + treatment * y1 + gamma * noise

    elif setting == "interaction":
        # required to distinguish ATT and ATE
        y0 = beta1 * confounding
        y1 = y0 + beta0 * tf.math.square(confounding)

        simulated_score = (1. - treatment) * y0 + treatment * y1 + gamma * noise

    else:
        raise Exception('setting argument to make_simulated_labeler not recognized')

    return simulated_score, y0, y1


def _make_hidden_float_constant(value, name):
    # hack to prevent tensorflow from writing the constant to the graphdef
    return tf.py_func(
        lambda: value,
        [], tf.float32, stateful=False,
        name=name)


def make_buzzy_based_simulated_labeler(treat_strength, con_strength, noise_level, setting="simple", seed=0):
    # hardcode probability of theorem given buzzy / not_buzzy
    theorem_given_buzzy_probs = np.array([0.27, 0.07], dtype=np.float32)

    np.random.seed(seed)
    all_noise = np.array(random.normal(0, 1, 12000), dtype=np.float32)
    all_threshholds = np.array(random.uniform(0, 1, 12000), dtype=np.float32)

    def labeler(data):
        buzzy = data['buzzy_title']
        index = data['index']
        treatment = data['theorem_referenced']
        treatment = tf.cast(treatment, tf.float32)
        confounding = 3.0*(tf.gather(theorem_given_buzzy_probs, buzzy) - 0.25)

        noise = tf.gather(all_noise, index)

        y, y0, y1 = outcome_sim(treat_strength, con_strength, noise_level, treatment, confounding, noise, setting=setting)
        simulated_prob = tf.nn.sigmoid(y)
        y0 = tf.nn.sigmoid(y0)
        y1 = tf.nn.sigmoid(y1)
        threshold = tf.gather(all_threshholds, index)
        simulated_outcome = tf.cast(tf.greater(simulated_prob, threshold), tf.int32)

        return {**data, 'outcome': simulated_outcome, 'y0': y0, 'y1': y1}

    return labeler


def make_propensity_based_simulated_labeler(treat_strength, con_strength, noise_level,
                                            base_propensity_scores, example_indices, exogeneous_con=0.,
                                            setting="simple", seed=42):
    np.random.seed(seed)
    all_noise = random.normal(0, 1, base_propensity_scores.shape[0]).astype(np.float32)
    all_threshholds = np.array(random.uniform(0, 1, base_propensity_scores.shape[0]), dtype=np.float32)

    extra_confounding = random.normal(0, 1, base_propensity_scores.shape[0]).astype(np.float32)

    all_propensity_scores = expit((1.-exogeneous_con)*logit(base_propensity_scores) + exogeneous_con * extra_confounding).astype(np.float32)
    all_treatments = random.binomial(1, all_propensity_scores).astype(np.int32)

    # indices in dataset refer to locations in entire corpus,
    # but propensity scores will typically only inlcude a subset of the examples
    reindex_hack = np.zeros(12000, dtype=np.int32)
    reindex_hack[example_indices] = np.arange(example_indices.shape[0], dtype=np.int32)

    def labeler(data):
        index = data['index']
        index_hack = tf.gather(reindex_hack, index)
        treatment = tf.gather(all_treatments, index_hack)
        confounding = 3.0 * (tf.gather(all_propensity_scores, index_hack) - 0.25)
        noise = tf.gather(all_noise, index_hack)

        y, y0, y1 = outcome_sim(treat_strength, con_strength, noise_level, tf.cast(treatment, tf.float32), confounding, noise, setting=setting)
        simulated_prob = tf.nn.sigmoid(y)
        y0 = tf.nn.sigmoid(y0)
        y1 = tf.nn.sigmoid(y1)
        threshold = tf.gather(all_threshholds, index)
        simulated_outcome = tf.cast(tf.greater(simulated_prob, threshold), tf.int32)

        return {**data, 'outcome': simulated_outcome, 'y0': y0, 'y1': y1, 'treatment': treatment}

    return labeler


def make_split_document_labels(num_splits, dev_splits, test_splits):
    """
    Adapts tensorflow dataset to produce additional elements that indicate whether each datapoint is in train, dev,
    or test

    Particularly, splits the data into num_split folds, and censors the censored_split fold

    Parameters
    ----------
    num_splits integer in [0,100)
    dev_splits list of integers in [0,num_splits)
    test_splits list of integers in [0, num_splits)

    Returns
    -------
    fn: A function that can be used to map a dataset to censor some of the document labels.
    """
    def _tf_in1d(a,b):
        """
        Tensorflow equivalent of np.in1d(a,b)
        """
        a = tf.expand_dims(a, 0)
        b = tf.expand_dims(b, 1)
        return tf.reduce_any(tf.equal(a, b), 1)

    def _tf_scalar_a_in1d_b(a, b):
        """
        Tensorflow equivalent of np.in1d(a,b)
        """
        return tf.reduce_any(tf.equal(a, b))

    def fn(data):
        many_split = data['many_split']
        reduced_split = tf.floormod(many_split, num_splits)  # reduce the many splits to just num_splits

        in_dev = _tf_scalar_a_in1d_b(reduced_split, dev_splits)
        in_test = _tf_scalar_a_in1d_b(reduced_split, test_splits)
        in_train = tf.logical_not(tf.logical_or(in_dev, in_test))

        # in_dev = _tf_in1d(reduced_splits, dev_splits)
        # in_test = _tf_in1d(reduced_splits, test_splits)
        # in_train = tf.logical_not(tf.logical_or(in_dev, in_test))

        # code expects floats
        in_dev = tf.cast(in_dev, tf.float32)
        in_test = tf.cast(in_test, tf.float32)
        in_train = tf.cast(in_train, tf.float32)

        return {**data, 'in_dev': in_dev, 'in_test': in_test, 'in_train': in_train}

    return fn


def dataset_processing(dataset, parser, masker, labeler, is_training, num_splits, dev_splits, test_splits, batch_size,
                       filter_test=False,
                       shuffle_buffer_size=100):
    """

    Parameters
    ----------
    dataset  tf.data dataset
    parser function, read the examples, should be based on tf.parse_single_example
    masker function, should provide Bert style masking
    labeler function, produces labels
    is_training
    num_splits
    censored_split
    batch_size
    filter_test restricts to only examples where in_test=1
    shuffle_buffer_size

    Returns
    -------

    """

    if is_training:
        dataset = dataset.repeat()
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    data_processing = compose(parser,  # parse from tf_record
                              labeler,  # add a label (unused downstream at time of comment)
                              make_split_document_labels(num_splits, dev_splits, test_splits),  # censor some labels
                              masker)  # Bert style token masking for unsupervised training

    dataset = dataset.map(data_processing, 4)

    if filter_test:
        def filter_test_fn(data):
            return tf.equal(data['in_test'], 1)

        dataset = dataset.filter(filter_test_fn)

    if is_training:
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
    else:
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=False)

    return dataset


def make_input_fn_from_file(input_files_or_glob, seq_length,
                            num_splits, dev_splits, test_splits,
                            tokenizer, is_training,
                            filter_test=False,
                            shuffle_buffer_size=100, seed=0, labeler=None):

    input_files = []
    for input_pattern in input_files_or_glob.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    if labeler is None:
        labeler = make_label()

    def input_fn(params):
        batch_size = params["batch_size"]

        if is_training:
            dataset = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
            dataset = dataset.repeat()
            dataset = dataset.shuffle(buffer_size=len(input_files))
            cycle_length = min(4, len(input_files))

        else:
            dataset = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
            cycle_length = 1  # go through the datasets in a deterministic order

        # make the record parsing ops
        max_abstract_len = seq_length

        parser = make_parser(max_abstract_len)  # parse the tf_record
        parser = compose(parser, make_extra_feature_cleaning())
        masker = make_input_id_masker(tokenizer, seed)  # produce masked subsets for unsupervised training

        # for use with interleave
        def _dataset_processing(input):
            input_dataset = tf.data.TFRecordDataset(input)
            processed_dataset = dataset_processing(input_dataset,
                                                   parser, masker, labeler,
                                                   is_training,
                                                   num_splits, dev_splits, test_splits,
                                                   batch_size, filter_test, shuffle_buffer_size)
            return processed_dataset

        dataset = dataset.apply(
            tf.data.experimental.parallel_interleave(
                _dataset_processing,
                sloppy=is_training,
                cycle_length=cycle_length))

        return dataset

    return input_fn


def main():
    tf.enable_eager_execution()

    parser = argparse.ArgumentParser()
    parser.add_argument('--shuffle_buffer_size', type=int, default=100)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--max_abs_len', type=int, default=250)

    args = parser.parse_args()

    # for easy debugging
    # filename = "../../dat/PeerRead/proc/acl_2017.tf_record"
    # filename = glob.glob('../dat/PeerRead/proc/*.tf_record')
    filename = '../dat/PeerRead/proc/arxiv-all.tf_record'

    vocab_file = "../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt"
    seed = 0
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

    num_splits = 10
    # dev_splits = [0]
    # test_splits = [0]
    dev_splits = []
    test_splits = [1,2]


    labeler = make_buzzy_based_simulated_labeler(0.5, 5.0, 0.0, 'simple',
                                                 seed=0)

    base_propensities_path = 'logs/peerread/buzzy_based_sim/modesimple/beta00.25.beta11.0.gamma0.0/split0/predict/test_results.tsv'
    output = pd.read_csv(base_propensities_path, '\t')
    base_propensity_scores = np.concatenate([output['treatment_probability'].values, np.zeros(8)])
    example_indices = output['index'].values

    labeler = make_propensity_based_simulated_labeler(treat_strength=0.25,
                                                      con_strength=1.0,
                                                      noise_level=0.0,
                                                      base_propensity_scores=base_propensity_scores,
                                                      example_indices=example_indices,
                                                      exogeneous_con=0.,
                                                      setting="simple",
                                                      seed=0)
    # labeler = None

    input_dataset_from_filenames = make_input_fn_from_file(filename,
                                                           250,
                                                           num_splits,
                                                           dev_splits,
                                                           test_splits,
                                                           tokenizer,
                                                           is_training=True,
                                                           filter_test=False,
                                                           shuffle_buffer_size=25000,
                                                           labeler=labeler,
                                                           seed=0)
    params = {'batch_size': 4096}
    input_dataset = input_dataset_from_filenames(params)

    # masker = make_input_id_masker(tokenizer, seed)
    # parser = make_parser(args.max_abs_len)
    # labeler = make_label()
    #
    # dataset = tf.data.TFRecordDataset(filename)

    # input_dataset = dataset_processing(dataset, parser, masker, labeler,
    #                                    is_training=True, num_examples=num_examples, split_indices=split_indices,
    #                                    batch_size=args.batch_size, shuffle_buffer_size=100)

    secs = []

    itr = input_dataset.make_one_shot_iterator()
    # print(itr.get_next()["token_ids"].name)
    # for i in range(1000):
    #     sample = itr.get_next()

    for i in range(10):
        sample = itr.get_next()
        print(np.max(sample['index']))
        print(np.min(sample['index']))
        # "title_contains_deep": tf.int64,
        # "title_contains_neural": tf.int64,
        # "title_contains_embedding": tf.int64,
        # "title_contains_gan": tf.int64,
        # print(sample['buzzy_title'])
        # venue = sample['venue']
        # arxiv = sample['arxiv']
        # print("venue: {}".format(venue))
        # print("arxiv: {}".format(arxiv))
        #
        # print("frac_arxiv: {}".format((np.equal(arxiv,0)).mean()))

        # print("year: {}".format(sample['year']))

        # thm_ref = sample['theorem_referenced'].numpy()
        # buzzy_title = sample['buzzy_title'].numpy()
        #
        # t_pr = thm_ref.mean()
        # b_pr = buzzy_title.mean()
        #
        # tb_pr = (thm_ref*buzzy_title).mean()
        # tnotb_pr = (thm_ref*(1.-buzzy_title)).mean()

        # print("t_pr: {}".format(t_pr))
        # print("b_pr: {}".format(b_pr))
        # print("th given buzzy: {}".format(tb_pr / b_pr))
        # print("th given not buzzy: {}".format(tnotb_pr / (1-b_pr)))

        # treatment = sample['theorem_referenced'].numpy()
        # treatment = sample['treatment']
        # print("treatment: {}".format(treatment.mean()))
        # outcome = sample['outcome'].numpy()
        # print("outcome: {}".format(outcome.mean()))
        #
        # print("outcome_st_treatment: {}".format((outcome*treatment).mean()/treatment.mean()))
        # print("outcome_st_not_treatment: {}".format((outcome*(1.-treatment)).mean()/(1.-treatment).mean()))
        #
        # # print("outcome: {}".format(tf.reduce_mean(tf.cast(sample['outcome'], tf.float32))))
        # print("y0: {}".format(sample['y0']))
        # print("y1: {}".format(sample['y1']))

if __name__ == "__main__":
    main()