Python bert.tokenization.FullTokenizer() Examples

The following are 30 code examples of bert.tokenization.FullTokenizer(), drawn from open-source projects. The project, source file, and license for each example are noted in its header. You may also want to check out all available functions and classes of the bert.tokenization module.
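
For orientation, every example below follows the same basic pattern: construct a FullTokenizer from a WordPiece vocabulary file, split raw text into sub-word tokens, and map those tokens to integer vocabulary IDs. A minimal sketch (the vocab.txt path is a placeholder for a downloaded BERT checkpoint directory; the module is assumed to be installed via the bert-tensorflow package):

from bert import tokenization

# do_lower_case should match the checkpoint: True for uncased models,
# False for cased models.
tokenizer = tokenization.FullTokenizer(
    vocab_file="uncased_L-12_H-768_A-12/vocab.txt", do_lower_case=True)

tokens = tokenizer.tokenize("An unwanted, running example.")  # WordPiece tokens
ids = tokenizer.convert_tokens_to_ids(tokens)                 # integer vocab IDs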
Example #1
Source File: tokenization_test.py    From QGforQA with MIT License
def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
      if six.PY2:
        vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
      else:
        vocab_writer.write("".join(
            [x + "\n" for x in vocab_tokens]).encode("utf-8"))

      vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 
Example #2
Source File: server.py    From Bert-TextClassification with MIT License
def __init__(self, id, args, worker_address, sink_address):
        super().__init__()
        self.model_dir = args.model_dir
        self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
        self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
        self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
        self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
        self.max_seq_len = args.max_seq_len
        self.worker_id = id
        self.daemon = True
        self.model_fn = model_fn_builder(
            bert_config=modeling.BertConfig.from_json_file(self.config_fp),
            init_checkpoint=self.checkpoint_fp,
            pooling_strategy=args.pooling_strategy,
            pooling_layer=args.pooling_layer
        )
        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
        self.estimator = Estimator(self.model_fn, config=RunConfig(session_config=config))
        self.exit_flag = multiprocessing.Event()
        self.logger = set_logger('WORKER-%d' % self.worker_id)
        self.worker_address = worker_address
        self.sink_address = sink_address 
Example #3
Source File: tokenization_test.py    From Bert-TextClassification with MIT License
def test_full_tokenizer(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
            "##ing", ","
        ]
        with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
            vocab_writer.write("".join(
                [x + "\n" for x in vocab_tokens]).encode("utf-8"))

            vocab_file = vocab_writer.name

        tokenizer = tokenization.FullTokenizer(vocab_file)
        os.unlink(vocab_file)

        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
        self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

        self.assertAllEqual(
            tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 
Example #4
Source File: bert_example.py    From lasertagger with Apache License 2.0
def __init__(self, label_map, vocab_file,
               max_seq_length, do_lower_case,
               converter):
    """Initializes an instance of BertExampleBuilder.

    Args:
      label_map: Mapping from tags to tag IDs.
      vocab_file: Path to BERT vocabulary file.
      max_seq_length: Maximum sequence length.
      do_lower_case: Whether to lower case the input text. Should be True for
        uncased models and False for cased models.
      converter: Converter from text targets to tags.
    """
    self._label_map = label_map
    self._tokenizer = tokenization.FullTokenizer(vocab_file,
                                                 do_lower_case=do_lower_case)
    self._max_seq_length = max_seq_length
    self._converter = converter
    self._pad_id = self._get_pad_id()
    self._keep_tag_id = self._label_map['KEEP'] 
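
A hypothetical instantiation of the builder above (the label map, vocabulary path, and converter are illustrative placeholders, not values from the lasertagger repo):

# 'converter' is assumed to be a lasertagger converter from text targets to
# tag sequences, built elsewhere from the phrase vocabulary.
builder = BertExampleBuilder(
    label_map={'KEEP': 0, 'DELETE': 1},   # must contain a 'KEEP' entry
    vocab_file='uncased_L-12_H-768_A-12/vocab.txt',
    max_seq_length=128,
    do_lower_case=True,                   # True for uncased checkpoints
    converter=converter)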
Example #5
Source File: process_PeerRead_abstracts.py    From causal-text-embeddings with MIT License
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--review-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/reviews')
    parser.add_argument('--parsedpdf-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/parsed_pdfs')
    parser.add_argument('--out-dir', type=str, default='../dat/PeerRead/proc')
    parser.add_argument('--out-file', type=str, default='arxiv-all.tf_record')
    parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    parser.add_argument('--max-abs-len', type=int, default=250)
    parser.add_argument('--venue', type=int, default=0)
    parser.add_argument('--year', type=int, default=2017)


    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    clean_PeerRead_dataset(args.review_json_dir, args.parsedpdf_json_dir,
                           args.venue, args.year,
                           args.out_dir, args.out_file,
                           args.max_abs_len, tokenizer, is_arxiv=True) 
Example #6
Source File: process_reddit.py    From causal-text-embeddings with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', type=str, default=None)
    parser.add_argument('--out-dir', type=str, default='../dat/reddit')
    parser.add_argument('--out-file', type=str, default='proc.tf_record')
    parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    parser.add_argument('--max-abs-len', type=int, default=128)
    parser.add_argument('--subsample', type=int, default=0)
    parser.add_argument('--use-latest-reddit', type=bool, default=True)

    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    process_reddit_dataset(args.data_dir, args.out_dir, args.out_file,
                           args.max_abs_len, tokenizer, args.subsample, args.use_latest_reddit) 
Example #7
Source File: train_decoder_layer.py    From sqlova with Apache License 2.0
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if no_pretraining:
        pass
    else:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config 
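
A hedged call sketch for this helper (repeated nearly verbatim in Examples #8 and #9); the directory and bert_type values are placeholders, and the function expects config, vocab, and PyTorch checkpoint files following the naming scheme shown above:

model_bert, tokenizer, bert_config = get_bert(
    BERT_PT_PATH='./data_and_model',
    bert_type='uncased_L-12_H-768_A-12',
    do_lower_case=True,
    no_pretraining=False)  # loads pytorch_model_uncased_L-12_H-768_A-12.bin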
Example #8
Source File: train.py    From sqlova with Apache License 2.0
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if no_pretraining:
        pass
    else:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config 
Example #9
Source File: train_shallow_layer.py    From sqlova with Apache License 2.0
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if no_pretraining:
        pass
    else:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config 
Example #10
Source File: sent_eval.py    From embedding with MIT License
def __init__(self, model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
                 bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
                 vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
                 max_seq_length=32, dimension=768, num_labels=2, use_notebook=False):

        super().__init__("bert", dimension, use_notebook)
        config = BertConfig.from_json_file(bertconfig_fname)
        self.max_seq_length = max_seq_length
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
        self.model, self.input_ids, self.input_mask, self.segment_ids, self.probs = make_bert_graph(config,
                                                                                                    max_seq_length,
                                                                                                    1.0,
                                                                                                    num_labels,
                                                                                                    tune=False)
        saver = tf.train.Saver(tf.global_variables())
        self.sess = tf.Session()
        checkpoint_path = tf.train.latest_checkpoint(model_fname)
        saver.restore(self.sess, checkpoint_path) 
Example #11
Source File: preprocess_qa.py    From language with Apache License 2.0
def main(_):
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  examples = read_examples(input_file=FLAGS.input_file)

  # Pre-shuffle the input to avoid having to make a very large shuffle
  # buffer in the `input_fn`.
  rng = random.Random(12345)
  rng.shuffle(examples)

  # We write to a temporary file to avoid storing very large
  # constant tensors in memory.
  writer = FeatureWriter(filename=FLAGS.output_file)
  features = []

  def append_feature(feature):
    features.append(feature)
    writer.process_feature(feature)

  convert_examples_to_features(
      examples=examples,
      tokenizer=tokenizer,
      max_doc_length=FLAGS.max_seq_length,
      doc_stride=FLAGS.doc_stride,
      max_query_length=FLAGS.max_query_length,
      output_fn=append_feature)
  writer.close()
  tf.logging.info("%d original examples read.", len(examples))
  tf.logging.info("%d split records written.", writer.num_features)

  if FLAGS.feature_file is not None:
    json.dump([[vars(ee) for ee in examples], [vars(ff) for ff in features]],
              tf.gfile.Open(FLAGS.feature_file, "w")) 
Example #12
Source File: run_classifier_with_tfhub.py    From QGforQA with MIT License
def create_tokenizer_from_hub_module(bert_hub_module_handle):
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(bert_hub_module_handle)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
  return tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case) 
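
A usage sketch, assuming the standard uncased BERT-Base TF-Hub handle (URL shown for illustration):

BERT_HUB_HANDLE = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
tokenizer = create_tokenizer_from_hub_module(BERT_HUB_HANDLE)
tokens = tokenizer.tokenize("This is a test sentence.")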
Example #13
Source File: raw_books_preproc_pipeline.py    From language with Apache License 2.0
def preproc_doc(document):
  """Convert document to list of TF Examples for binary order classification.

  Args:
      document: a CCNews article (i.e. a list of sentences)

  Returns:
      A list of tfexamples of binary orderings of pairs of sentences in the
      document. The tfexamples are serialized to string to be written directly
      to TFRecord.
  """
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # document = [
  #     tokenization.convert_to_unicode(
  #         unidecode.unidecode(line.decode("utf-8"))) for line in document
  # ]

  sent_tokens = [tokenizer.tokenize(sent) for sent in document if sent]
  sent_tokens = [sent for sent in sent_tokens if len(sent) > 1]
  if len(sent_tokens) < 8:
    return []

  # Convert token lists into ids and add any needed tokens and padding for BERT
  tf_example = convert_instance_to_tf_example(tokenizer, sent_tokens,
                                              FLAGS.max_sent_length,
                                              FLAGS.max_para_length)

  # Serialize TFExample for writing to file.
  tf_examples = [tf_example.SerializeToString()]

  return tf_examples 
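
A hedged invocation sketch (FLAGS.vocab_file and the sentence/paragraph length flags are assumed to have been parsed already; documents with fewer than eight usable sentences yield an empty list):

serialized = preproc_doc([
    "The first sentence of the article.",
    "The second sentence of the article.",
    # ... at least eight sentences of more than one token each are required,
    # otherwise preproc_doc returns [].
])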
Example #14
Source File: ccnews_preproc_pipeline.py    From language with Apache License 2.0
def preproc_doc(document):
  """Convert document to list of TF Examples for binary order classification.

  Args:
      document: a CCNews article (i.e. a list of sentences)

  Returns:
      A list of tfexamples of binary orderings of pairs of sentences in the
      document. The tfexamples are serialized to string to be written directly
      to TFRecord.
  """
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  document = [
      tokenization.convert_to_unicode(
          unidecode.unidecode(line.decode("utf-8"))) for line in document
  ]

  sent_tokens = [tokenizer.tokenize(sent) for sent in document if sent]
  sent_tokens = [sent for sent in sent_tokens if len(sent) > 1]
  if len(sent_tokens) < 8:
    return []

  # Convert token lists into ids and add any needed tokens and padding for BERT
  tf_example = convert_instance_to_tf_example(tokenizer, sent_tokens,
                                              FLAGS.max_sent_length,
                                              FLAGS.max_para_length)

  # Serialize TFExample for writing to file.
  tf_examples = [tf_example.SerializeToString()]

  return tf_examples 
Example #15
Source File: convert_to_examples.py    From language with Apache License 2.0
def main(unused_argv):
  tokenizer = FullTokenizer(FLAGS.tokenizer_vocabulary)

  print('Loading ' + str(FLAGS.dataset_name) + ' dataset from ' +
        FLAGS.input_filepath)

  # The debugging file saves all of the processed SQL queries.
  debugging_file = gfile.Open(
      os.path.join('/'.join(FLAGS.output_filepath.split('/')[:-1]),
                   FLAGS.dataset_name + '_'.join(FLAGS.splits) + '_gold.txt'),
      'w')

  # The output file will save a sequence of string-serialized JSON objects, one
  # line per object.
  output_file = gfile.Open(os.path.join(FLAGS.output_filepath), 'w')

  if FLAGS.dataset_name.lower() == 'spider':
    num_examples_created, num_examples_failed = process_spider(
        output_file, debugging_file, tokenizer)
  elif FLAGS.dataset_name.lower() == 'wikisql':
    num_examples_created, num_examples_failed = process_wikisql(
        output_file, debugging_file, tokenizer)
  else:
    num_examples_created, num_examples_failed = process_michigan_datasets(
        output_file, debugging_file, tokenizer)

  print('Wrote %s examples, could not annotate %s examples.' %
        (num_examples_created, num_examples_failed))
  debugging_file.write('Wrote %s examples, could not annotate %s examples.' %
                       (num_examples_created, num_examples_failed))
  debugging_file.close()
  output_file.close() 
Example #16
Source File: create_tfrecords.py    From language with Apache License 2.0
def main(_):
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  examples = read_examples(input_file=FLAGS.input_file)

  # Pre-shuffle the input to avoid having to make a very large shuffle
  # buffer in the `input_fn`.
  rng = random.Random(12345)
  rng.shuffle(examples)

  # We write to a temporary file to avoid storing very large
  # constant tensors in memory.
  writer = FeatureWriter(filename=FLAGS.output_file)
  features = []

  def append_feature(feature):
    features.append(feature)
    writer.process_feature(feature)

  convert_examples_to_features(
      examples=examples,
      tokenizer=tokenizer,
      max_doc_length=FLAGS.max_seq_length,
      doc_stride=FLAGS.doc_stride,
      max_query_length=FLAGS.max_query_length,
      output_fn=append_feature)
  writer.close()
  tf.logging.info("%d original examples read.", len(examples))
  tf.logging.info("%d split records written.", writer.num_features)

  if FLAGS.feature_file is not None:
    json.dump([[vars(ee) for ee in examples], [vars(ff) for ff in features]],
              tf.gfile.Open(FLAGS.feature_file, "w")) 
Example #17
Source File: create_pretraining_data.py    From QGforQA with MIT License
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Reading from input files ***")
  for input_file in input_files:
    tf.logging.info("  %s", input_file)

  rng = random.Random(FLAGS.random_seed)
  instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
      rng)

  output_files = FLAGS.output_file.split(",")
  tf.logging.info("*** Writing to output files ***")
  for output_file in output_files:
    tf.logging.info("  %s", output_file)

  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files) 
Example #18
Source File: run_nq.py    From language with Apache License 2.0
def __init__(self, is_training):
    self.is_training = is_training
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 
Example #19
Source File: preprocess_bert_dataset.py    From delta with Apache License 2.0
def bert_preprocess(filename, vocab):
  tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab, do_lower_case=False)
  new_filename = filename + ".bert"
  f1 = open(new_filename, 'w')
  per_count = 0
  with open(filename, "r") as f:
    lines = f.readlines()
    for line in lines:
      str1 = line.split("\t")[1]
      label1 = line.split("\t")[0]
      new_label_list = []
      old_label_list = label1.split(' ')
      word_list = str1.split(' ')
      tokens = []
      tokens.append('[CLS]')
      new_label_list.append('O')
      per_count = 0
      for i, (w, t) in enumerate(zip(word_list, old_label_list)):
        token = tokenizer.tokenize(w)
        tokens.extend(token)
        for i, _ in enumerate(token):
          if i == 0:
            new_label_list.append(t)
          else:
            new_label_list.append("X")
      tokens.append('[SEP]')
      new_label_list.append('O')
      assert len(tokens) == len(new_label_list)
      rm_new_label_list = [i for i in new_label_list if i != 'O' and i != 'X']
      rm_old_label_list = [i for i in old_label_list if i != 'O' and i != 'X']
      assert len(rm_new_label_list) == len(rm_old_label_list)
      f1.write(" ".join(new_label_list) + '\t' +
               " ".join(tokens) + '\n') 
Example #20
Source File: bert_sim.py    From chinese-bert-similarity with MIT License
def __init__(self, gpu_no, log_dir, bert_sim_dir, verbose=False):
        self.bert_sim_dir = bert_sim_dir
        self.logger = set_logger(colored('BS', 'cyan'), log_dir, verbose)

        self.tf = import_tf(gpu_no, verbose)

        # add tokenizer
        from bert import tokenization
        self.tokenizer = tokenization.FullTokenizer(os.path.join(bert_sim_dir, 'vocab.txt'))
        # add placeholder
        self.input_ids = self.tf.placeholder(self.tf.int32, (None, 45), 'input_ids')
        self.input_mask = self.tf.placeholder(self.tf.int32, (None, 45), 'input_mask')
        self.input_type_ids = self.tf.placeholder(self.tf.int32, (None, 45), 'input_type_ids')
        # init graph
        self._init_graph() 
Example #21
Source File: bert_predict.py    From FoolNLTK with Apache License 2.0
def __init__(self, export_model_path, vocab_file):
    self.export_model_path = export_model_path
    self.vocab_file = vocab_file
    self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.predict_fn = predictor.from_saved_model(self.export_model_path)
    self.label_map = pickle.load(open(LABEL_FILE, 'rb'))
    self.id_to_label = {v: k for k, v in self.label_map.items()} 
Example #22
Source File: minimize.py    From coref with Apache License 2.0
def minimize_language(language, labels, stats, vocab_file, seg_len, input_dir, output_dir, do_lower_case):
  # do_lower_case = True if 'chinese' in vocab_file else False
  tokenizer = tokenization.FullTokenizer(
                vocab_file=vocab_file, do_lower_case=do_lower_case)
  minimize_partition("dev", language, "v4_gold_conll", labels, stats, tokenizer, seg_len, input_dir, output_dir)
  minimize_partition("train", language, "v4_gold_conll", labels, stats, tokenizer, seg_len, input_dir, output_dir)
  minimize_partition("test", language, "v4_gold_conll", labels, stats, tokenizer, seg_len, input_dir, output_dir) 
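
A hedged call sketch (directory names, the vocab path, and seg_len are placeholders; treating labels and stats as defaultdicts accumulated across partitions is an assumption about how the coref repo drives this function):

import collections

labels = collections.defaultdict(set)
stats = collections.defaultdict(int)
minimize_language("english", labels, stats,
                  vocab_file="cased_L-12_H-768_A-12/vocab.txt",
                  seg_len=384,
                  input_dir="conll_data", output_dir="bert_data",
                  do_lower_case=False)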
Example #23
Source File: overlap_minimize.py    From coref with Apache License 2.0
def minimize_language(language, labels, stats, vocab_file, seg_len, input_dir, output_dir, do_lower_case):
  tokenizer = tokenization.FullTokenizer(
                vocab_file=vocab_file, do_lower_case=do_lower_case)
  minimize_partition("dev", language, "v4_gold_conll", labels, stats, tokenizer, seg_len, input_dir, output_dir)
  minimize_partition("train", language, "v4_gold_conll", labels, stats, tokenizer, seg_len, input_dir, output_dir)
  minimize_partition("test", language, "v4_gold_conll", labels, stats, tokenizer, seg_len, input_dir, output_dir) 
Example #24
Source File: run_similarity.py    From KBQA-BERT with MIT License
def __init__(self, batch_size=args.batch_size):
        self.mode = None
        self.max_seq_length = args.max_seq_len
        self.tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True)
        self.batch_size = batch_size
        self.estimator = None
        self.processor = SimProcessor()
        tf.logging.set_verbosity(tf.logging.INFO) 
Example #25
Source File: strings_utils.py    From ludwig with Apache License 2.0
def __init__(self, vocab_file=None, **kwargs):
        super().__init__()
        if vocab_file is None:
            raise ValueError(
                'Vocabulary file is required to initialize BERT tokenizer'
            )

        try:
            from bert.tokenization import FullTokenizer
        except ImportError:
            raise ValueError(
                "Please install bert-tensorflow: pip install bert-tensorflow"
            )

        self.tokenizer = FullTokenizer(vocab_file) 
Example #26
Source File: create_pretraining_data.py    From causal-text-embeddings with MIT License
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Reading from input files ***")
  for input_file in input_files:
    tf.logging.info("  %s", input_file)

  rng = random.Random(FLAGS.random_seed)
  instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
      rng)

  output_files = FLAGS.output_file.split(",")
  tf.logging.info("*** Writing to output files ***")
  for output_file in output_files:
    tf.logging.info("  %s", output_file)

  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files) 
Example #27
Source File: array_from_dataset.py    From causal-text-embeddings with MIT License
def buzzy_title_based_sim_dfs(treat_strength, con_strength, noise_level, setting="simple", seed=0,
                            base_output_dir='../dat/sim/peerread_buzzytitle_based/'):

    labeler = make_buzzy_based_simulated_labeler(treat_strength, con_strength, noise_level, setting=setting, seed=seed)

    num_splits = 10
    dev_splits = [0]
    test_splits = [0]

    # data_file = '../dat/reddit/proc.tf_record'
    # vocab_file = "../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt"
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

    input_dataset_from_filenames = make_input_fn_from_file(data_file,
                                                           250,
                                                           num_splits,
                                                           dev_splits,
                                                           test_splits,
                                                           tokenizer,
                                                           is_training=False,
                                                           filter_test=False,
                                                           shuffle_buffer_size=25000,
                                                           seed=seed,
                                                           labeler=labeler)

    output_df = dataset_fn_to_df(input_dataset_from_filenames)
    output_df = output_df.rename(index=str, columns={'theorem_referenced': 'treatment'})

    output_dir = os.path.join(base_output_dir, "mode{}".format(setting))
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "beta0{}.beta1{}.gamma{}.tsv".format(treat_strength, con_strength, noise_level))

    output_df.to_csv(output_path, '\t') 
Example #28
Source File: clean_PeerRead.py    From causal-text-embeddings with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--datasets-dir', type=str, default='../dat/PeerRead')
    parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    args = parser.parse_args()

    datasets_dir = args.datasets_dir
    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    def proc_dataset(dataset):
        all_dir = os.path.join(datasets_dir, dataset_paths[dataset], 'all')
        review_json_dir = os.path.join(all_dir, 'reviews')
        parsedpdf_json_dir = os.path.join(all_dir, 'parsed_pdfs')

        venue = dataset_venues[dataset]
        year = dataset_years[dataset]

        out_dir = os.path.join(datasets_dir, 'proc')
        out_file = dataset + '.tf_record'
        max_abs_len = 250

        clean_PeerRead_dataset(review_json_dir, parsedpdf_json_dir, venue, year, out_dir, out_file, max_abs_len,
                               tokenizer)

    # pool = mp.Pool(4)
    # pool.map(proc_dataset, dataset_names)

    for dataset in dataset_names:
        proc_dataset(dataset) 
Example #29
Source File: extra_vocab.py    From causal-text-embeddings with MIT License
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--review-json-dir', type=str, default=None)
    parser.add_argument('--vocab-file', type=str, default=None)

    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    review_json_dir = args.review_json_dir

    print('Reading reviews from...', review_json_dir)
    paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_json_dir)))

    paper_json_filename = paper_json_filenames[0]
    with io.open(paper_json_filename) as json_file:
        loaded = json.load(json_file)
    abstract = loaded['abstract']
    print(abstract)
    tokens = tokenizer.tokenize(abstract)
    print(tokens)
    print(tokenizer.convert_tokens_to_ids(tokens))

    # for idx, paper_json_filename in enumerate(paper_json_filenames):
    #     with io.open(paper_json_filename) as json_file:
    #         loaded = json.load(json_file)
    #
    #     print(loaded['abstract']) 
Example #30
Source File: tune_utils.py    From embedding with MIT License
def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
                 test_corpus_fname=None, tokenized_test_corpus_fname=None,
                 model_name="bert", model_save_path=None, vocab_fname=None, eval_every=1000,
                 batch_size=32, num_epochs=10, dropout_keep_prob_rate=0.9, model_ckpt_path=None,
                 sp_model_path=None):
        # configurations
        tf.logging.set_verbosity(tf.logging.INFO)
        self.model_name = model_name
        self.eval_every = eval_every
        self.model_ckpt_path = model_ckpt_path
        self.model_save_path = model_save_path
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.dropout_keep_prob_rate = dropout_keep_prob_rate
        self.best_valid_score = 0.0
        if not os.path.exists(model_save_path):
            os.mkdir(model_save_path)
        # define tokenizer
        if self.model_name == "bert":
            self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
        elif self.model_name == "xlnet":
            sp = spm.SentencePieceProcessor()
            sp.Load(sp_model_path)
            self.tokenizer = sp
        else:
            self.tokenizer = get_tokenizer("mecab")
        # load or tokenize corpus
        self.train_data, self.train_data_size = self.load_or_tokenize_corpus(train_corpus_fname, tokenized_train_corpus_fname)
        self.test_data, self.test_data_size = self.load_or_tokenize_corpus(test_corpus_fname, tokenized_test_corpus_fname)