Python bert.tokenization.printable_text() Examples

The following are 26 code examples of bert.tokenization.printable_text(), collected from open-source projects. The source file, project, and license are noted above each example.
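Before the examples, a minimal usage sketch may help (it assumes the bert-tensorflow package, which provides bert.tokenization, is installed): printable_text() accepts either str or bytes and returns a printable str, decoding bytes as UTF-8, which is why the snippets below route ids and token strings through it before formatting them into repr or log output.

from bert import tokenization

# printable_text() normalizes its argument for printing/logging:
# str passes through unchanged, bytes are decoded as UTF-8.
print(tokenization.printable_text(u"question?"))                  # question?
print(tokenization.printable_text(u"question?".encode("utf-8")))  # question?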
Example #1
Source File: triviaqa_document_utils.py    From RE3QA with Apache License 2.0
def __repr__(self):
        s = ""
        s += "document_id: %s" % (self.document_id)
        s += ", qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += ", question_text: %s" % (
            tokenization.printable_text(self.question_text))
        s += ", doc_tokens: %s ..." % (" ".join(self.doc_tokens[:20]))
        s += ", length of doc_tokens: %d" % (len(self.doc_tokens))
        if self.orig_answer_texts:
            s += ", orig_answer_texts: {}".format(self.orig_answer_texts)
        if self.start_positions and self.end_positions:
            s += ", start_positions: {}".format(self.start_positions)
            s += ", end_positions: {}".format(self.end_positions)
            s += ", token_answer: "
            for start, end in zip(self.start_positions, self.end_positions):
                s += "{}, ".format(" ".join(self.doc_tokens[start:(end+1)]))
        return s 
Example #2
Source File: run_bert_open_qa_eval.py    From XQA with MIT License
def __repr__(self):
    s = ""
    s += "id: %s" % (self.qid)
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    s += ", answer_text: %s" % (self.orig_answer_text)
    return s 
Example #3
Source File: squad_document_utils.py    From RE3QA with Apache License 2.0
def __repr__(self):
        s = ""
        s += "document_id: %s" % (self.document_id)
        s += ", qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += ", question_text: %s" % (
            tokenization.printable_text(self.question_text))
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens[:20]))
        s += ", length of doc_tokens: [%d]" % (len(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.end_position:
            s += ", end_position: %d" % (self.end_position)
        return s 
Example #4
Source File: squad_document_utils.py    From RE3QA with Apache License 2.0
def __repr__(self):
        s = ""
        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += ", question_text: %s" % (
            tokenization.printable_text(self.question_text))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.end_position:
            s += ", end_position: %d" % (self.end_position)
        return s 
Example #5
Source File: squad_open_utils.py    From RE3QA with Apache License 2.0
def __repr__(self):
        s = ""
        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += "doc_index: %d" % (self.doc_index)
        s += "para_index: %d" % (self.para_index)
        s += ", question_text: %s" % (
            tokenization.printable_text(self.question_text))
        if self.answer_texts is not None:
            s += ", answer_texts: ".format(self.answer_texts)
        return s 
Example #6
Source File: squad_utils.py    From RE3QA with Apache License 2.0
def __repr__(self):
        s = ""
        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += ", question_text: %s" % (
            tokenization.printable_text(self.question_text))
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.end_position:
            s += ", end_position: %d" % (self.end_position)
        return s 
Example #7
Source File: run_squad.py    From MAX-Question-Answering with Apache License 2.0
def __repr__(self):
        s = ""
        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += ", question_text: %s" % (
            tokenization.printable_text(self.question_text))
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.end_position:
            s += ", end_position: %d" % (self.end_position)
        if self.is_impossible:
            s += ", is_impossible: %r" % (self.is_impossible)
        return s 
Example #8
Source File: utils.py    From SpanABSA with Apache License 2.0
def __repr__(self):
        s = ""
        # s += "example_id: %s" % (tokenization.printable_text(self.example_id))
        s += ", sent_tokens: [%s]" % (" ".join(self.sent_tokens))
        if self.term_texts:
            s += ", term_texts: {}".format(self.term_texts)
        # if self.start_positions:
        #     s += ", start_positions: {}".format(self.start_positions)
        # if self.end_positions:
        #     s += ", end_positions: {}".format(self.end_positions)
        if self.polarities:
            s += ", polarities: {}".format(self.polarities)
        return s 
Example #9
Source File: squad_utils.py    From SpanABSA with Apache License 2.0
def __repr__(self):
        s = ""
        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += ", question_text: %s" % (
            tokenization.printable_text(self.question_text))
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.end_position:
            s += ", end_position: %d" % (self.end_position)
        return s 
Example #10
Source File: run_dualencoder_lsf.py    From language with Apache License 2.0
def __repr__(self):
    s = ""
    s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    if self.start_position:
      s += ", start_position: %d" % (self.start_position)
    if self.end_position:
      s += ", end_position: %d" % (self.end_position)
    if self.is_impossible:
      s += ", is_impossible: %r" % (self.is_impossible)
    return s 
Example #11
Source File: run_dualencoder_qa.py    From language with Apache License 2.0
def __repr__(self):
    s = ""
    s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    if self.start_position:
      s += ", start_position: %d" % (self.start_position)
    if self.end_position:
      s += ", end_position: %d" % (self.end_position)
    if self.is_impossible:
      s += ", is_impossible: %r" % (self.is_impossible)
    return s 
Example #12
Source File: create_tfrecords.py    From language with Apache License 2.0
def __repr__(self):
    s = ""
    s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    if self.start_position:
      s += ", start_position: %d" % (self.start_position)
    if self.end_position:
      s += ", end_position: %d" % (self.end_position)
    if self.is_impossible:
      s += ", is_impossible: %r" % (self.is_impossible)
    return s 
Example #13
Source File: answer_extractor.py    From language with Apache License 2.0
def __repr__(self):
    s = ""
    s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    if self.start_position:
      s += ", start_position: %d" % (self.start_position)
    if self.end_position:
      s += ", end_position: %d" % (self.end_position)
    if self.is_impossible:
      s += ", is_impossible: %r" % (self.is_impossible)
    return s 
Example #14
Source File: run_squad.py    From language with Apache License 2.0
def __repr__(self):
    s = ""
    s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    if self.start_position:
      s += ", start_position: %d" % (self.start_position)
    if self.end_position:
      s += ", end_position: %d" % (self.end_position)
    if self.is_impossible:
      s += ", is_impossible: %r" % (self.is_impossible)
    return s 
Example #15
Source File: preprocessing_utils.py    From language with Apache License 2.0
def __str__(self):
    s = ""
    for sent in self.tokens:
      s += "tokens: %s\n" % (" ".join(
          [tokenization.printable_text(x) for x in sent]))
    s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
    s += "\n"
    return s 
Example #16
Source File: preprocessing_utils.py    From language with Apache License 2.0
def __str__(self):
    s = ""
    for sent in self.tokens[0]:
      s += "tokens: %s\n" % (" ".join(
          [tokenization.printable_text(x) for x in sent]))
    s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids[0]]))
    s += "\n"
    return s 
Example #17
Source File: run_squad_membership.py    From language with Apache License 2.0
def __repr__(self):
    s = ""
    s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    if self.label_id is not None:
      s += ", membership label_id: %d" % (self.label_id)
    return s 
Example #18
Source File: run_squad.py    From language with Apache License 2.0
def __repr__(self):
    s = ""
    s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    if self.start_position:
      s += ", start_position: %d" % (self.start_position)
    if self.end_position:
      s += ", end_position: %d" % (self.end_position)
    if self.is_impossible:
      s += ", is_impossible: %r" % (self.is_impossible)
    return s 
Example #19
Source File: create_pretraining_data.py    From QGforQA with MIT License
def __str__(self):
    s = ""
    s += "tokens: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.tokens]))
    s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
    s += "is_random_next: %s\n" % self.is_random_next
    s += "masked_lm_positions: %s\n" % (" ".join(
        [str(x) for x in self.masked_lm_positions]))
    s += "masked_lm_labels: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.masked_lm_labels]))
    s += "\n"
    return s 
Example #20
Source File: test_squad.py    From QGforQA with MIT License
def __repr__(self):
    s = ""
    s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    if self.start_position:
      s += ", start_position: %d" % (self.start_position)
    if self.end_position:
      s += ", end_position: %d" % (self.end_position)
    if self.is_impossible:
      s += ", is_impossible: %r" % (self.is_impossible)
    return s 
Example #21
Source File: drop_utils.py    From MTMSN with Apache License 2.0
def __repr__(self):
        s = ""
        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += ", \nquestion: %s" % (" ".join(self.question_tokens))
        s += ", \npassage: %s" % (" ".join(self.passage_tokens))
        if self.numbers_in_passage:
            s += ", \nnumbers_in_passage: {}".format(self.numbers_in_passage)
        if self.number_indices:
            s += ", \nnumber_indices: {}".format(self.number_indices)
        if self.answer_type:
            s += ", \nanswer_type: {}".format(self.answer_type)
        if self.number_of_answer:
            s += ", \nnumber_of_answer: {}".format(self.number_of_answer)
        if self.passage_spans:
            s += ", \npassage_spans: {}".format(self.passage_spans)
        if self.question_spans:
            s += ", \nquestion_spans: {}".format(self.question_spans)
        if self.add_sub_expressions:
            s += ", \nadd_sub_expressions: {}".format(self.add_sub_expressions)
        if self.counts:
            s += ", \ncounts: {}".format(self.counts)
        if self.negations:
            s += ", \nnegations: {}".format(self.negations)
        if self.answer_annotations:
            s += ", \nanswer_annotations: {}".format(self.answer_annotations)
        return s 
Example #22
Source File: squad_utils.py    From MTMSN with Apache License 2.0
def __repr__(self):
        s = ""
        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += ", question_text: %s" % (
            tokenization.printable_text(self.question_text))
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        if self.end_position:
            s += ", end_position: %d" % (self.end_position)
        return s 
Example #23
Source File: create_pretraining_data.py    From causal-text-embeddings with MIT License
def __str__(self):
    s = ""
    s += "tokens: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.tokens]))
    s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
    s += "is_random_next: %s\n" % self.is_random_next
    s += "masked_lm_positions: %s\n" % (" ".join(
        [str(x) for x in self.masked_lm_positions]))
    s += "masked_lm_labels: %s\n" % (" ".join(
        [tokenization.printable_text(x) for x in self.masked_lm_labels]))
    s += "\n"
    return s 
Example #24
Source File: run_bert_open_qa_train.py    From XQA with MIT License
def __repr__(self):
    s = ""
    s += "id: %s" % (self.qid)
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    if self.start_position:
      s += ", start_positions: %s" % (self.start_position)
    if self.end_position:
      s += ", end_positions: %s" % (self.end_position)
    return s 
Example #25
Source File: input_fns.py    From language with Apache License 2.0
def convert_examples_to_features(examples, tokenizer, max_query_length,
                                 entity2id, output_fn):
  """Loads a data file into a list of `InputBatch`s."""
  for (example_index, example) in tqdm(enumerate(examples)):
    qry_input_ids, qry_input_mask, qry_tokens = get_tokens_and_mask(
        example.question_text, tokenizer, max_query_length)
    relation_input_ids, relation_input_mask = [], []
    if example.relations is not None:
      for relation in example.relations:
        rel_input_ids, rel_input_mask, _ = get_tokens_and_mask(
            relation, tokenizer, max_query_length)
        relation_input_ids.append(rel_input_ids)
        relation_input_mask.append(rel_input_mask)
    if example_index < 20:
      tf.logging.info("*** Example ***")
      tf.logging.info("unique_id: %s", example.qas_id)
      tf.logging.info(
          "qry_tokens: %s",
          " ".join([tokenization.printable_text(x) for x in qry_tokens]))
      tf.logging.info("qry_input_ids: %s",
                      " ".join([str(x) for x in qry_input_ids]))
      tf.logging.info("qry_input_mask: %s",
                      " ".join([str(x) for x in qry_input_mask]))
      for ii in range(len(relation_input_ids)):
        tf.logging.info("relation_input_ids_%d: %s", ii,
                        " ".join([str(x) for x in relation_input_ids[ii]]))
        tf.logging.info("relation_input_mask_%d: %s", ii,
                        " ".join([str(x) for x in relation_input_mask[ii]]))
      tf.logging.info("qry_entity_id: %s (%d)", example.subject_entity[0],
                      entity2id.get(example.subject_entity[0], None))
      tf.logging.info("answer entity: %s", str(example.answer_entity))

    feature = InputFeatures(
        qas_id=example.qas_id.encode("utf-8"),
        qry_tokens=qry_tokens,
        qry_input_ids=qry_input_ids,
        qry_input_mask=qry_input_mask,
        relation_input_ids=relation_input_ids,
        relation_input_mask=relation_input_mask,
        qry_entity_id=[entity2id.get(ee, 0) for ee in example.subject_entity],
        answer_mention=example.answer_mention,
        answer_entity=example.answer_entity,
        bridge_mention=example.bridge_mention,
        bridge_entity=example.bridge_entity)

    # Run callback
    output_fn(feature) 
Example #26
Source File: loader.py    From text_bert_cnn with MIT License
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """
    Converts every InputExample into the token form the model expects and
    returns the four variables the BERT model needs:
    input_ids: the vocabulary ids of text_a (the text to classify), at character level;
    input_mask: BERT's masking marker; all 1s for real tokens;
    segment_ids: sentence markers; this task only has text_a, so all 0s;
    label_ids: the id of the text's label, not in one-hot form.
    """
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    input_data = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        input_mask = [1] * len(input_ids)

        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_map[example.label]
        if ex_index < 3:
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            tf.logging.info("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

        features = collections.OrderedDict()
        features["input_ids"] = input_ids
        features["input_mask"] = input_mask
        features["segment_ids"] = segment_ids
        features["label_ids"] =label_id
        input_data.append(features)

    return input_data
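For context, a minimal, hypothetical call of this helper (the vocab path, label list, and sequence length below are placeholders, a list of InputExample objects named examples is assumed, and FullTokenizer comes from the same bert.tokenization module):

tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
# Each element of input_data is an OrderedDict holding input_ids, input_mask,
# and segment_ids (all padded to max_seq_length) plus a scalar label_ids.
input_data = convert_examples_to_features(examples, ["0", "1"], 128, tokenizer)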