Python allennlp.data.fields.SequenceLabelField() Examples

The following are 28 code examples of allennlp.data.fields.SequenceLabelField(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.data.fields, or try the search function.
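Before the examples, here is a minimal self-contained sketch of the field itself, assuming an AllenNLP 0.9-style API (the tokens, tags, and the pos_tags namespace are illustrative): a SequenceLabelField pairs one label with each element of a sequence field, and string labels are indexed through a vocabulary namespace.

from allennlp.data.fields import TextField, SequenceLabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

# One label per token; the constructor checks that both lengths match.
tokens = TextField([Token(t) for t in ["The", "cat", "sat"]],
                   {"tokens": SingleIdTokenIndexer()})
tags = SequenceLabelField(["DET", "NOUN", "VERB"], tokens, label_namespace="pos_tags")

vocab = Vocabulary()
for tag in tags.labels:
    vocab.add_token_to_namespace(tag, namespace="pos_tags")
tags.index(vocab)                                  # map label strings to ids
print(tags.as_tensor(tags.get_padding_lengths()))  # a LongTensor of three tag ids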
Example #1
Source File: dataset_reader.py    From ConvLab with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
        intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields["tokens"] = sequence
        if tags:
            fields["tags"] = SequenceLabelField(tags, sequence)
        if domain:
            fields["domain"] = LabelField(domain, label_namespace="domain_labels")
        if intent:
            fields["intent"] = LabelField(intent, label_namespace="intent_labels")
        if dialog_act is not None:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
            'dialog_act': dialog_act})
        else:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
        return Instance(fields) 
Example #2
Source File: summarization_sentence_tagger_reader.py    From summarus with Apache License 2.0
def text_to_instance(self, text: str, sentences: List[str] = None, tags: List[int] = None) -> Instance:
        if sentences is None:
            if self._language == "ru":
                sentences = [s.text for s in razdel.sentenize(text)]
            else:
                sentences = nltk.tokenize.sent_tokenize(text)
        sentences_tokens = []
        for sentence in sentences[:self._max_sentences_count]:
            sentence = sentence.lower() if self._lowercase else sentence
            tokens = self._tokenizer.tokenize(sentence)[:self._sentence_max_tokens]
            tokens.insert(0, Token(START_SYMBOL))
            tokens.append(Token(END_SYMBOL))
            indexed_tokens = TextField(tokens, self._source_token_indexers)
            sentences_tokens.append(indexed_tokens)

        sentences_tokens_indexed = ListField(sentences_tokens)
        result = {'source_sentences': sentences_tokens_indexed}

        if tags:
            result["sentences_tags"] = SequenceLabelField(tags[:self._max_sentences_count], sentences_tokens_indexed)
        return Instance(result) 
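Note the second argument on the tags line: the sequence being tagged is the ListField of sentences, not a TextField. SequenceLabelField accepts any SequenceField, so one tag per sentence works the same way as one tag per token. A minimal sketch of the pattern (illustrative data):

from allennlp.data.fields import ListField, TextField, SequenceLabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

indexers = {"tokens": SingleIdTokenIndexer()}
sentences = ListField([
    TextField([Token("First"), Token("sentence")], indexers),
    TextField([Token("Second"), Token("sentence")], indexers),
])
in_summary = SequenceLabelField([1, 0], sentences)  # one tag per list element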
Example #3
Source File: template_text2sql.py    From allennlp-semparse with Apache License 2.0
def text_to_instance(
        self,  # type: ignore
        query: List[str],
        slot_tags: List[str] = None,
        sql_template: str = None,
    ) -> Instance:
        fields: Dict[str, Field] = {}
        tokens = TextField([Token(t) for t in query], self._token_indexers)
        fields["tokens"] = tokens

        if slot_tags is not None and sql_template is not None:
            slot_field = SequenceLabelField(slot_tags, tokens, label_namespace="slot_tags")
            template = LabelField(sql_template, label_namespace="template_labels")
            fields["slot_tags"] = slot_field
            fields["template"] = template

        return Instance(fields) 
Example #4
Source File: prolocal_dataset_reader.py    From propara with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         sentence_tokens: List[str],
                         verb_vector: List[int],
                         entity_vector: List[int],
                         state_change_types: Optional[List[str]] = None,
                         state_change_tags: Optional[List[str]] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        # encode inputs
        token_field = TextField([Token(word) for word in sentence_tokens], self._token_indexers)
        fields['tokens'] = token_field
        fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
        fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

        # encode outputs
        if state_change_types:
            fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
        if state_change_tags:
            fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

        return Instance(fields) 
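The verb and entity vectors rely on a useful detail: when every label is an int, SequenceLabelField uses the values directly and skips the vocabulary lookup, so 0/1 indicator vectors pass straight through to the tensor. A sketch (illustrative sentence):

from allennlp.data.fields import TextField, SequenceLabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

sentence = TextField([Token(t) for t in ["Water", "evaporates", "quickly"]],
                     {"tokens": SingleIdTokenIndexer()})
verb_span = SequenceLabelField([0, 1, 0], sentence, "indicator_tags")
print(verb_span.labels)  # [0, 1, 0] -- already index-ready, no vocab needed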
Example #5
Source File: dataset_reader.py    From nanigonet with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str]=None) -> Instance:

        if len(tokens) > self._max_token_len:
            tokens = tokens[:self._max_token_len]
            print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
            if tags:
                tags = tags[:self._max_token_len]

        fields = {}

        text_field = TextField(tokens, self._token_indexers)
        fields['tokens'] = text_field
        if tags:
            fields['tags'] = SequenceLabelField(tags, text_field)

        return Instance(fields) 
Example #6
Source File: semantic_role_labeling.py    From magnitude with MIT License
def text_to_instance(self,  # type: ignore
                         tokens: List[Token],
                         verb_label: List[int],
                         tags: List[str] = None) -> Instance:
        u"""
        We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
        one-hot binary vector, the same length as the tokens, indicating the position of the verb
        to find arguments for.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        text_field = TextField(tokens, token_indexers=self._token_indexers)
        fields[u'tokens'] = text_field
        fields[u'verb_indicator'] = SequenceLabelField(verb_label, text_field)
        if tags:
            fields[u'tags'] = SequenceLabelField(tags, text_field)

        if all([x == 0 for x in verb_label]):
            verb = None
        else:
            verb = tokens[verb_label.index(1)].text
        fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens],
                                            u"verb": verb})
        return Instance(fields) 
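The verb lookup at the end distills to an index into the one-hot vector; a toy trace (illustrative sentence):

verb_label = [0, 1, 0]
words = ["Dogs", "chase", "cats"]
verb = None if all(x == 0 for x in verb_label) else words[verb_label.index(1)]
assert verb == "chase"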
Example #7
Source File: dataset_reader.py    From ConvLab with MIT License
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
        intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        # print([t.text for t in context_tokens])
        fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
        fields["tokens"] = TextField(tokens, self._token_indexers)
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
        if tags is not None:
            fields["tags"] = SequenceLabelField(tags, fields["tokens"])
        if intents is not None:
            fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
        if dialog_act is not None:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
            'dialog_act': dialog_act})
        else:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
        return Instance(fields) 
Example #8
Source File: vcr.py    From r2c with MIT License
def _fix_tokenization(tokenized_sent, bert_embs, old_det_to_new_ind, obj_to_type, token_indexers, pad_ind=-1):
    """
    Turn a detection list into what we want: some text, as well as some tags.
    :param tokenized_sent: Tokenized sentence with detections collapsed to a list.
    :param old_det_to_new_ind: Mapping of the old ID -> new ID (which will be used as the tag)
    :param obj_to_type: [person, person, pottedplant] indexed by the old labels
    :return: tokenized sentence
    """

    new_tokenization_with_tags = []
    for tok in tokenized_sent:
        if isinstance(tok, list):
            for int_name in tok:
                obj_type = obj_to_type[int_name]
                new_ind = old_det_to_new_ind[int_name]
                if new_ind < 0:
                    raise ValueError("Oh no, the new index is negative! that means it's invalid. {} {}".format(
                        tokenized_sent, old_det_to_new_ind
                    ))
                text_to_use = GENDER_NEUTRAL_NAMES[
                    new_ind % len(GENDER_NEUTRAL_NAMES)] if obj_type == 'person' else obj_type
                new_tokenization_with_tags.append((text_to_use, new_ind))
        else:
            new_tokenization_with_tags.append((tok, pad_ind))

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags 
Example #9
Source File: depend_parse.py    From glyce with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         words: List[str],
                         upos_tags: List[str],
                         dependencies: List[Tuple[str, int]] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        words : ``List[str]``, required.
            The words in the sentence to be encoded.
        upos_tags : ``List[str]``, required.
            The universal dependencies POS tags for each word.
        dependencies : ``List[Tuple[str, int]]``, optional (default = None)
            A list of (head tag, head index) tuples. Indices are 1-indexed,
            meaning an index of 0 corresponds to that word being the root of
            the dependency tree.

        Returns
        -------
        An instance containing words, upos tags, dependency head tags and head
        indices as fields.
        """
        fields: Dict[str, Field] = {}

        tokens = TextField([Token(w) for w in words], self._token_indexers)
        fields["words"] = tokens
        fields["pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace="pos")
        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                     tokens,
                                                     label_namespace="head_tags")
            fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                        tokens,
                                                        label_namespace="head_index_tags")

        fields["metadata"] = MetadataField({"words": words, "pos": upos_tags})
        return Instance(fields) 
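Per the docstring, head indices are 1-indexed with 0 reserved for the root, so a two-word sentence like "dogs bark" decomposes as follows (toy annotation):

# "dogs bark": the subject's head is token 2 ("bark"); "bark" itself is the root.
dependencies = [("nsubj", 2), ("root", 0)]
head_tags = [x[0] for x in dependencies]          # ['nsubj', 'root']
head_indices = [int(x[1]) for x in dependencies]  # [2, 0]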
Example #10
Source File: rebalance_dataset_ensemble.py    From swagaf with MIT License
def collate(self, items_l):
        # Assume all of these have the same length
        index_l, second_sentences_l, pos_tags_l, feats_l, context_len_l = zip(*items_l)

        feats = Variable(torch.FloatTensor(np.stack(feats_l)))
        inds = np.array(index_l)

        instances = []
        for second_sentences, pos_tags, context_len in zip(second_sentences_l, pos_tags_l, context_len_l):
            for second_sent, pos_tag in zip(second_sentences, pos_tags):
                instance_d = {
                    'words': TextField([Token(token) for token in ['@@bos@@'] + second_sent + ['@@eos@@']],
                                       {'tokens': SingleIdTokenIndexer(namespace='tokens', lowercase_tokens=True)}),
                    'postags': TextField([Token(token) for token in ['@@bos@@'] + pos_tag + ['@@eos@@']],
                                         {'pos': SingleIdTokenIndexer(namespace='pos', lowercase_tokens=False)}),
                }
                instance_d['context_indicator'] = SequenceLabelField([1] * (context_len + 1) +
                                                                     [0] * (len(second_sent) - context_len + 1),
                                                                     instance_d['words'])
                instances.append(Instance(instance_d))
        batch = Batch(instances)
        batch.index_instances(vocab)
        tensor_dict = batch.as_tensor_dict(for_training=self.train)

        # instances_mask = torch.LongTensor(np.stack([np.array([len(sub_g) > 0 for sub_g in g], dtype=np.int64)
        #                                             for g in selected_gens]))
        return {
            'lm_feats': feats,
            'inds': inds,
            'ending_word_ids': tensor_dict['words']['tokens'].view(inds.shape[0], -1,
                                                                   tensor_dict['words']['tokens'].size(1)),
            'postags_word_ids': tensor_dict['postags']['pos'].view(inds.shape[0], -1,
                                                                   tensor_dict['postags']['pos'].size(1)),
            'ctx_indicator': tensor_dict['context_indicator'].view(inds.shape[0], -1,
                                                                   tensor_dict['context_indicator'].size(1)),
        } 
Example #11
Source File: conll_reader.py    From allennlp_tutorial with MIT License
def text_to_instance(self,
                         words: List[str],
                         ner_tags: List[str]) -> Instance:
        fields: Dict[str, Field] = {}
        # wrap each token in the file with a token object
        tokens = TextField([Token(w) for w in words], self._token_indexers)

        # Instances in AllenNLP are created using Python dictionaries,
        # which map the token key to the Field type
        fields["tokens"] = tokens
        fields["label"] = SequenceLabelField(ner_tags, tokens)

        return Instance(fields) 
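One practical note on readers like this: SequenceLabelField validates the alignment eagerly, so a tag list that doesn't match the token count fails at construction time rather than deep inside training. A quick check (assuming allennlp's standard ConfigurationError behavior):

from allennlp.common.checks import ConfigurationError
from allennlp.data.fields import TextField, SequenceLabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

tokens = TextField([Token("one"), Token("two")], {"tokens": SingleIdTokenIndexer()})
try:
    SequenceLabelField(["B-X"], tokens)  # one label for two tokens
except ConfigurationError as err:
    print(err)  # reports that label and sequence lengths differ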
Example #12
Source File: vcr.py    From HGL-pytorch with MIT License
def _fix_tokenization(tokenized_sent, bert_embs, old_det_to_new_ind, obj_to_type, token_indexers, pad_ind=-1):
    """
    Turn a detection list into what we want: some text, as well as some tags.
    :param tokenized_sent: Tokenized sentence with detections collapsed to a list.
    :param old_det_to_new_ind: Mapping of the old ID -> new ID (which will be used as the tag)
    :param obj_to_type: [person, person, pottedplant] indexed by the old labels
    :return: tokenized sentence
    """

    new_tokenization_with_tags = []
    for tok in tokenized_sent:
        if isinstance(tok, list):
            for int_name in tok:
                obj_type = obj_to_type[int_name]
                new_ind = old_det_to_new_ind[int_name]
                if new_ind < 0:
                    raise ValueError("Oh no, the new index is negative! that means it's invalid. {} {}".format(
                        tokenized_sent, old_det_to_new_ind
                    ))
                text_to_use = GENDER_NEUTRAL_NAMES[
                    new_ind % len(GENDER_NEUTRAL_NAMES)] if obj_type == 'person' else obj_type
                new_tokenization_with_tags.append((text_to_use, new_ind))
        else:
            new_tokenization_with_tags.append((tok, pad_ind))

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags 
Example #13
Source File: ebmnlp.py    From scibert with Apache License 2.0
def text_to_instance(self,
                         tokens: List[Token],
                         pico_tags: List[str] = None):
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
        
        # Set the field 'labels' according to the specified PIO element
        if pico_tags is not None:
            instance_fields['tags'] = SequenceLabelField(pico_tags, sequence, self.label_namespace)

        return Instance(instance_fields) 
Example #14
Source File: relation_instances_reader.py    From comb_dist_direct_relex with Apache License 2.0
def _tokens_distances_fields(self, tokens):
        """Returns the updated list of tokens and entity distances for the first and second entity as fields."""
        tokens, positions1, positions2 = self._tokens_distances(tokens)
        t_f = TextField(tokens, self._token_indexers)
        p1_f = SequenceLabelField(positions1, t_f)
        p2_f = SequenceLabelField(positions2, t_f)
        return t_f, p1_f, p2_f 
Example #15
Source File: sequence_tagging.py    From allennlp with Apache License 2.0
def text_to_instance(  # type: ignore
        self, tokens: List[Token], tags: List[str] = None
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """

        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields["tokens"] = sequence
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
        if tags is not None:
            fields["tags"] = SequenceLabelField(tags, sequence)
        return Instance(fields) 
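This one ships with AllenNLP itself, as part of SequenceTaggingDatasetReader; a quick usage sketch (assuming the allennlp package is installed):

from allennlp.data.dataset_readers import SequenceTaggingDatasetReader
from allennlp.data.tokenizers import Token

reader = SequenceTaggingDatasetReader()
instance = reader.text_to_instance([Token("high"), Token("tide")], ["JJ", "NN"])
print(instance.fields["tags"].labels)  # ['JJ', 'NN']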
Example #16
Source File: list_field_test.py    From magnitude with MIT License
def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace(u"this", u"words")
        self.vocab.add_token_to_namespace(u"is", u"words")
        self.vocab.add_token_to_namespace(u"a", u"words")
        self.vocab.add_token_to_namespace(u"sentence", u'words')
        self.vocab.add_token_to_namespace(u"s", u'characters')
        self.vocab.add_token_to_namespace(u"e", u'characters')
        self.vocab.add_token_to_namespace(u"n", u'characters')
        self.vocab.add_token_to_namespace(u"t", u'characters')
        self.vocab.add_token_to_namespace(u"c", u'characters')
        for label in [u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', u'i', u'j', u'k']:
            self.vocab.add_token_to_namespace(label, u'labels')

        self.word_indexer = {u"words": SingleIdTokenIndexer(u"words")}
        self.words_and_characters_indexers = {u"words": SingleIdTokenIndexer(u"words"),
                                              u"characters": TokenCharactersIndexer(u"characters")}
        self.field1 = TextField([Token(t) for t in [u"this", u"is", u"a", u"sentence"]],
                                self.word_indexer)
        self.field2 = TextField([Token(t) for t in [u"this", u"is", u"a", u"different", u"sentence"]],
                                self.word_indexer)
        self.field3 = TextField([Token(t) for t in [u"this", u"is", u"another", u"sentence"]],
                                self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp() 
Example #17
Source File: universal_dependencies.py    From magnitude with MIT License
def text_to_instance(self,  # type: ignore
                         words: List[str],
                         upos_tags: List[str],
                         dependencies: List[Tuple[str, int]] = None) -> Instance:
        # pylint: disable=arguments-differ
        u"""
        Parameters
        ----------
        words : ``List[str]``, required.
            The words in the sentence to be encoded.
        upos_tags : ``List[str]``, required.
            The universal dependencies POS tags for each word.
        dependencies : ``List[Tuple[str, int]]``, optional (default = None)
            A list of (head tag, head index) tuples. Indices are 1-indexed,
            meaning an index of 0 corresponds to that word being the root of
            the dependency tree.

        Returns
        -------
        An instance containing words, upos tags, dependency head tags and head
        indices as fields.
        """
        fields: Dict[str, Field] = {}

        tokens = TextField([Token(w) for w in words], self._token_indexers)
        fields[u"words"] = tokens
        fields[u"pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace=u"pos")
        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields[u"head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                     tokens,
                                                     label_namespace=u"head_tags")
            fields[u"head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                        tokens,
                                                        label_namespace=u"head_index_tags")

        fields[u"metadata"] = MetadataField({u"words": words, u"pos": upos_tags})
        return Instance(fields) 
Example #18
Source File: datareader.py    From NLP_Toolkit with Apache License 2.0
def text_to_instance(self, tokens: List[Token], tags: List[str] = None,
                         words: List[str] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields["tokens"] = sequence
        fields["metadata"] = MetadataField({"words": words})
        if tags is not None:
            labels, detect_tags, complex_flag_dict = self.extract_tags(tags)
            if self._skip_complex and complex_flag_dict[self._skip_complex] > 0:
                return None
            rnd = random()
            # skip TN
            if self._skip_correct and all(x == "CORRECT" for x in detect_tags):
                if rnd > self._tn_prob:
                    return None
            # skip TP
            else:
                if rnd > self._tp_prob:
                    return None

            fields["labels"] = SequenceLabelField(labels, sequence,
                                                  label_namespace="labels")
            fields["d_tags"] = SequenceLabelField(detect_tags, sequence,
                                                  label_namespace="d_tags")
        return Instance(fields) 
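The sampling logic downsamples true negatives (sentences whose detect tags are all "CORRECT") with probability _tn_prob and keeps everything else with probability _tp_prob; distilled into a standalone helper (illustrative default probabilities):

from random import random

def keep_sentence(detect_tags, tn_prob=0.1, tp_prob=1.0):
    # Sentences with nothing to correct are mostly skipped; sentences with edits are kept.
    if all(tag == "CORRECT" for tag in detect_tags):
        return random() <= tn_prob
    return random() <= tp_prob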
Example #19
Source File: sequence_tagging.py    From magnitude with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:  # type: ignore
        u"""
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields[u"tokens"] = sequence
        fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
        if tags is not None:
            fields[u"tags"] = SequenceLabelField(tags, sequence)
        return Instance(fields) 
Example #20
Source File: ontonotes_ner.py    From magnitude with MIT License
def text_to_instance(self, # type: ignore
                         tokens: List[Token],
                         ner_tags: List[str] = None) -> Instance:
        u"""
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {u'tokens': sequence}
        instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
        # Add "tag label" to instance
        if ner_tags is not None:
            if self._coding_scheme == u"BIOUL":
                ner_tags = to_bioul(ner_tags, encoding=u"BIO")
            instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
        return Instance(instance_fields) 
Example #21
Source File: input_reduction.py    From allennlp with Apache License 2.0
def _get_ner_tags_and_mask(
    instance: Instance, input_field_to_attack: str, ignore_tokens: List[str]
):
    """
    Used for the NER task. Sets the num_ignore tokens, saves the original predicted tag and a 0/1
    mask in the position of the tags
    """
    # Set num_ignore_tokens
    num_ignore_tokens = 0
    input_field: TextField = instance[input_field_to_attack]  # type: ignore
    for token in input_field.tokens:
        if str(token) in ignore_tokens:
            num_ignore_tokens += 1

    # save the original tags and a 0/1 mask where the tags are
    tag_mask = []
    original_tags = []
    tag_field: SequenceLabelField = instance["tags"]  # type: ignore
    for label in tag_field.labels:
        if label != "O":
            tag_mask.append(1)
            original_tags.append(label)
            num_ignore_tokens += 1
        else:
            tag_mask.append(0)
    return num_ignore_tokens, tag_mask, original_tags 
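The mask construction boils down to one comparison per label; a toy restatement:

# Non-"O" positions get a 1 and are counted among the tokens that input
# reduction must preserve.
tags = ["O", "B-PER", "I-PER", "O"]
tag_mask = [int(label != "O") for label in tags]
assert tag_mask == [0, 1, 1, 0]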
Example #22
Source File: conll2003.py    From magnitude with MIT License
def text_to_instance(self, # type: ignore
                         tokens: List[Token],
                         pos_tags: List[str] = None,
                         chunk_tags: List[str] = None,
                         ner_tags: List[str] = None) -> Instance:
        u"""
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {u'tokens': sequence}
        instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})

        # Recode the labels if necessary.
        if self.coding_scheme == u"BIOUL":
            coded_chunks = to_bioul(chunk_tags) if chunk_tags is not None else None
            coded_ner = to_bioul(ner_tags) if ner_tags is not None else None
        else:
            # the default IOB1
            coded_chunks = chunk_tags
            coded_ner = ner_tags

        # Add "feature labels" to instance
        if u'pos' in self.feature_labels:
            if pos_tags is None:
                raise ConfigurationError(u"Dataset reader was specified to use pos_tags as "
                                         u"features. Pass them to text_to_instance.")
            instance_fields[u'pos_tags'] = SequenceLabelField(pos_tags, sequence, u"pos_tags")
        if u'chunk' in self.feature_labels:
            if coded_chunks is None:
                raise ConfigurationError(u"Dataset reader was specified to use chunk tags as "
                                         u"features. Pass them to text_to_instance.")
            instance_fields[u'chunk_tags'] = SequenceLabelField(coded_chunks, sequence, u"chunk_tags")
        if u'ner' in self.feature_labels:
            if coded_ner is None:
                raise ConfigurationError(u"Dataset reader was specified to use NER tags as "
                                         u" features. Pass them to text_to_instance.")
            instance_fields[u'ner_tags'] = SequenceLabelField(coded_ner, sequence, u"ner_tags")

        # Add "tag label" to instance
        if self.tag_label == u'ner' and coded_ner is not None:
            instance_fields[u'tags'] = SequenceLabelField(coded_ner, sequence)
        elif self.tag_label == u'pos' and pos_tags is not None:
            instance_fields[u'tags'] = SequenceLabelField(pos_tags, sequence)
        elif self.tag_label == u'chunk' and coded_chunks is not None:
            instance_fields[u'tags'] = SequenceLabelField(coded_chunks, sequence)

        return Instance(instance_fields) 
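The recoding helper used here is allennlp's to_bioul; a quick check of its effect, with explicit BIO input as in the OntoNotes reader of Example #20:

from allennlp.data.dataset_readers.dataset_utils import to_bioul

print(to_bioul(["B-PER", "I-PER", "O", "B-LOC"], encoding="BIO"))
# ['B-PER', 'L-PER', 'O', 'U-LOC']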
Example #23
Source File: universal_dependencies.py    From udify with MIT License
def text_to_instance(self,  # type: ignore
                         words: List[str],
                         lemmas: List[str] = None,
                         lemma_rules: List[str] = None,
                         upos_tags: List[str] = None,
                         xpos_tags: List[str] = None,
                         feats: List[str] = None,
                         dependencies: List[Tuple[str, int]] = None,
                         ids: List[str] = None,
                         multiword_ids: List[str] = None,
                         multiword_forms: List[str] = None) -> Instance:
        fields: Dict[str, Field] = {}

        tokens = TextField([Token(w) for w in words], self._token_indexers)
        fields["tokens"] = tokens

        names = ["upos", "xpos", "feats", "lemmas"]
        all_tags = [upos_tags, xpos_tags, feats, lemma_rules]
        for name, field in zip(names, all_tags):
            if field:
                fields[name] = SequenceLabelField(field, tokens, label_namespace=name)

        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                     tokens,
                                                     label_namespace="head_tags")
            fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                        tokens,
                                                        label_namespace="head_index_tags")

        fields["metadata"] = MetadataField({
            "words": words,
            "upos_tags": upos_tags,
            "xpos_tags": xpos_tags,
            "feats": feats,
            "lemmas": lemmas,
            "lemma_rules": lemma_rules,
            "ids": ids,
            "multiword_ids": multiword_ids,
            "multiword_forms": multiword_forms
        })

        return Instance(fields) 
Example #24
Source File: list_field_test.py    From allennlp with Apache License 2.0
def setup_method(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", "words")
        self.vocab.add_token_to_namespace("s", "characters")
        self.vocab.add_token_to_namespace("e", "characters")
        self.vocab.add_token_to_namespace("n", "characters")
        self.vocab.add_token_to_namespace("t", "characters")
        self.vocab.add_token_to_namespace("c", "characters")
        for label in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]:
            self.vocab.add_token_to_namespace(label, "labels")

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer
        )
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer
        )
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer
        )

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        empty_list_field = ListField([text_field.empty_field()])
        empty_fields = {"list_tensor": empty_list_field}
        self.empty_instance = Instance(empty_fields)

        non_empty_list_field = ListField([text_field])
        non_empty_fields = {"list_tensor": non_empty_list_field}
        self.non_empty_instance = Instance(non_empty_fields)

        super().setup_method() 
Example #25
Source File: transition_eds_reader.py    From HIT-SCIR-CoNLL2019 with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         lemmas: List[str] = None,
                         pos_tags: List[str] = None,
                         arc_indices: List[Tuple[int, int]] = None,
                         arc_tags: List[str] = None,
                         gold_actions: List[str] = None,
                         root_id: List[int] = None,
                         meta_info: List[str] = None,
                         concept_label: List[int] = None,
                         tokens_range: List[Tuple[int, int]] = None,
                         gold_mrps: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        token_field = TextField([Token(t) for t in tokens], self._token_indexers)

        fields["tokens"] = token_field
        meta_dict = {"tokens": tokens}

        if lemmas is not None and self._lemma_indexers is not None:
            fields["lemmas"] = TextField([Token(l) for l in lemmas], self._lemma_indexers)
        if pos_tags is not None:
            fields["pos_tags"] = SequenceLabelField(pos_tags, token_field, label_namespace="pos")

        if arc_indices is not None and arc_tags is not None:
            meta_dict["arc_indices"] = arc_indices
            meta_dict["arc_tags"] = arc_tags
            fields["arc_tags"] = TextField([Token(a) for a in arc_tags], self._arc_tag_indexers)

        if gold_actions is not None:
            meta_dict["gold_actions"] = gold_actions
            fields["gold_actions"] = TextField([Token(a) for a in gold_actions], self._action_indexers)

        if meta_info is not None:
            meta_dict["meta_info"] = meta_info[0]

        if gold_mrps is not None:
            meta_dict["gold_mrps"] = gold_mrps[0]

        if tokens_range is not None:
            meta_dict["tokens_range"] = tokens_range

        if concept_label is not None:
            meta_dict["concept_label"] = concept_label
            fields["concept_label"] = TextField([Token(a) for a in concept_label], self._concept_label_indexers)

        if root_id is not None:
            meta_dict["root_id"] = root_id[0]

        fields["metadata"] = MetadataField(meta_dict)

        return Instance(fields) 
Example #26
Source File: transition_sdp_reader.py    From HIT-SCIR-CoNLL2019 with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         lemmas: List[str] = None,
                         mrp_pos_tags: List[str] = None,
                         arc_indices: List[Tuple[int, int]] = None,
                         arc_tags: List[str] = None,
                         gold_actions: List[str] = None,
                         meta_info: List[str] = None,
                         tokens_range: List[Tuple[int, int]] = None,
                         frame: List[str] = None,
                         pos_tag: List[str] = None,
                         node_label: List[str] = None,
                         gold_mrps: List[str] = None) -> Instance:

        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        token_field = TextField([Token(t) for t in tokens], self._token_indexers)
        fields["tokens"] = token_field
        meta_dict = {"tokens": tokens}

        if lemmas is not None and self._lemma_indexers is not None:
            fields["lemmas"] = TextField([Token(l) for l in lemmas], self._lemma_indexers)

        if mrp_pos_tags is not None:
            fields["mrp_pos_tags"] = SequenceLabelField(mrp_pos_tags, token_field, label_namespace="pos")

        if frame is not None:
            fields["frame"] = SequenceLabelField(frame, token_field, label_namespace="frame")

        if pos_tag is not None:
            fields["pos_tag"] = SequenceLabelField(pos_tag, token_field, label_namespace="pos_tag")

        if node_label is not None:
            fields["node_label"] = SequenceLabelField(node_label, token_field, label_namespace="node_label")

        if arc_indices is not None and arc_tags is not None:
            meta_dict["arc_indices"] = arc_indices
            meta_dict["arc_tags"] = arc_tags
            fields["arc_tags"] = TextField([Token(a) for a in arc_tags], self._arc_tag_indexers)

        if gold_actions is not None:
            meta_dict["gold_actions"] = gold_actions
            fields["gold_actions"] = TextField([Token(a) for a in gold_actions], self._action_indexers)

        if meta_info is not None:
            meta_dict["meta_info"] = meta_info[0]

        if tokens_range is not None:
            meta_dict["tokens_range"] = tokens_range

        if gold_mrps is not None:
            meta_dict["gold_mrps"] = gold_mrps[0]

        fields["metadata"] = MetadataField(meta_dict)
        return Instance(fields) 
Example #27
Source File: transition_amr_reader.py    From HIT-SCIR-CoNLL2019 with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         lemmas: List[str] = None,
                         pos_tags: List[str] = None,
                         gold_actions: List[List[str]] = None,
                         id: str = None,
                         amr: str = None,
                         input: str = None,
                         mrp: str = None,
                         companion: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        token_field = TextField([Token(t) for t in tokens], self._token_indexers)
        fields["tokens"] = token_field
        meta_dict = {"tokens": tokens}
        if id:
            meta_dict["id"] = id
        if amr:
            meta_dict["amr"] = amr
        if input:
            meta_dict["input"] = input
        if mrp:
            meta_dict["mrp"] = json.loads(mrp)
        if companion:
            meta_dict["companion"] = json.loads(companion)

        if lemmas is not None and self._lemma_indexers is not None:
            fields["lemmas"] = TextField([Token(l) for l in lemmas], self._lemma_indexers)
        if pos_tags is not None:
            fields["pos_tags"] = SequenceLabelField(pos_tags, token_field, label_namespace="pos")

        if gold_actions is not None:
            meta_dict["gold_actions"] = ['@@:@@'.join(a) for a in gold_actions]
            fields["gold_actions"] = TextField([Token('@@:@@'.join(a)) for a in gold_actions],
                                               {'actions': SingleIdTokenIndexer(namespace='actions')})
            fields["gold_newnodes"] = TextField(
                [Token(a[1] if a[0] == 'NEWNODE' else DEFAULT_PADDING_TOKEN) for a in gold_actions],
                {'newnodes': SingleIdTokenIndexer(namespace='newnodes')})
            fields["gold_entities"] = TextField(
                [Token(a[1] if a[0] == 'ENTITY' else DEFAULT_PADDING_TOKEN) for a in gold_actions],
                {'entities': SingleIdTokenIndexer(namespace='entities')})
            fields["gold_relations"] = TextField(
                [Token(a[1] if a[0] in ['LEFT', 'RIGHT'] else DEFAULT_PADDING_TOKEN) for a in gold_actions],
                {'relations': SingleIdTokenIndexer(namespace='relations')})
        fields["metadata"] = MetadataField(meta_dict)

        return Instance(fields) 
Example #28
Source File: input_reduction.py    From allennlp with Apache License 2.0
def _remove_one_token(
    instance: Instance,
    input_field_to_attack: str,
    grads: np.ndarray,
    ignore_tokens: List[str],
    beam_size: int,
    tag_mask: List[int],
) -> List[Tuple[Instance, int, List[int]]]:
    """
    Finds the token with the smallest gradient and removes it.
    """
    # Compute L2 norm of all grads.
    grads_mag = [np.sqrt(grad.dot(grad)) for grad in grads]

    # Skip all ignore_tokens by setting grad to infinity
    text_field: TextField = instance[input_field_to_attack]  # type: ignore
    for token_idx, token in enumerate(text_field.tokens):
        if token in ignore_tokens:
            grads_mag[token_idx] = float("inf")

    # For NER, skip all tokens that are not in outside
    if "tags" in instance:
        tag_field: SequenceLabelField = instance["tags"]  # type: ignore
        labels: List[str] = tag_field.labels  # type: ignore
        for idx, label in enumerate(labels):
            if label != "O":
                grads_mag[idx] = float("inf")
    reduced_instances_and_smallest: List[Tuple[Instance, int, List[int]]] = []
    for _ in range(beam_size):
        # copy instance and edit later
        copied_instance = deepcopy(instance)
        copied_text_field: TextField = copied_instance[input_field_to_attack]  # type: ignore

        # find smallest
        smallest = np.argmin(grads_mag)
        if grads_mag[smallest] == float("inf"):  # if all are ignored tokens, return.
            break
        grads_mag[smallest] = float("inf")  # so the other beams don't use this token

        # remove smallest
        inputs_before_smallest = copied_text_field.tokens[0:smallest]
        inputs_after_smallest = copied_text_field.tokens[smallest + 1 :]
        copied_text_field.tokens = inputs_before_smallest + inputs_after_smallest

        if "tags" in instance:
            tag_field: SequenceLabelField = copied_instance["tags"]  # type: ignore
            tag_field_before_smallest = tag_field.labels[0:smallest]
            tag_field_after_smallest = tag_field.labels[smallest + 1 :]
            tag_field.labels = tag_field_before_smallest + tag_field_after_smallest  # type: ignore
            tag_field.sequence_field = copied_text_field

        copied_instance.indexed = False
        reduced_instances_and_smallest.append((copied_instance, smallest, tag_mask))

    return reduced_instances_and_smallest
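As a toy trace of the core removal step, with made-up gradient magnitudes for a three-token input:

import numpy as np

grads_mag = [0.9, 0.1, 0.5]
smallest = int(np.argmin(grads_mag))        # token 1 has the weakest gradient
tokens = ["the", "second", "token"]
reduced = tokens[:smallest] + tokens[smallest + 1:]
assert reduced == ["the", "token"]          # the tag list is sliced the same way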