Python allennlp.data.fields.ListField() Examples

The following are 30 code examples of allennlp.data.fields.ListField(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.data.fields , or try the search function .
Example #1
Source File: arc_multichoice_json_reader.py    From ARC-Solvers with Apache License 2.0 6 votes vote down vote up
def text_to_instance(self,  # type: ignore
                         item_id: Any,
                         question_text: str,
                         choice_text_list: List[str],
                         answer_id: int) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        question_tokens = self._tokenizer.tokenize(question_text)
        choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]
        fields['question'] = TextField(question_tokens, self._token_indexers)
        fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
        fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
           "id": item_id,
           "question_text": question_text,
           "choice_text_list": choice_text_list,
           "question_tokens": [x.text for x in question_tokens],
           "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
        }

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields) 
Example #2
Source File: array_field_test.py    From allennlp with Apache License 2.0 6 votes vote down vote up
def test_padding_handles_list_fields_with_padding_values(self):
        array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
        array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = (
            list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy()
        )
        correct_tensor = numpy.array(
            [
                [[1.0, 1.0, 1.0, -1.0, -1.0], [1.0, 1.0, 1.0, -1.0, -1.0]],
                [[1.0, 1.0, 1.0, 1.0, 1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
                [[-1.0, -1.0, -1.0, -1.0, -1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
            ]
        )
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor) 
Example #3
Source File: list_field_test.py    From magnitude with MIT License 6 votes vote down vote up
def test_fields_can_pad_to_greater_than_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        padding_lengths[u"list_num_tokens"] = 7
        padding_lengths[u"num_fields"] = 5
        tensor_dict = list_field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"][0].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"][1].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 1, 5, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"][2].detach().cpu().numpy(),
                                                numpy.array([2, 3, 1, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"][3].detach().cpu().numpy(),
                                                numpy.array([0, 0, 0, 0, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"][4].detach().cpu().numpy(),
                                                numpy.array([0, 0, 0, 0, 0, 0, 0])) 
Example #4
Source File: summarization_sentence_tagger_reader.py    From summarus with Apache License 2.0 6 votes vote down vote up
def text_to_instance(self, text: str, sentences: List[str] = None, tags: List[int] = None) -> Instance:
        if sentences is None:
            if self._language == "ru":
                sentences = [s.text for s in razdel.sentenize(text)]
            else:
                sentences = nltk.tokenize.sent_tokenize(text)
        sentences_tokens = []
        for sentence in sentences[:self._max_sentences_count]:
            sentence = sentence.lower() if self._lowercase else sentence
            tokens = self._tokenizer.tokenize(sentence)[:self._sentence_max_tokens]
            tokens.insert(0, Token(START_SYMBOL))
            tokens.append(Token(END_SYMBOL))
            indexed_tokens = TextField(tokens, self._source_token_indexers)
            sentences_tokens.append(indexed_tokens)

        sentences_tokens_indexed = ListField(sentences_tokens)
        result = {'source_sentences': sentences_tokens_indexed}

        if tags:
            result["sentences_tags"] = SequenceLabelField(tags[:self._max_sentences_count], sentences_tokens_indexed)
        return Instance(result) 
Example #5
Source File: test_dict_field.py    From kb with Apache License 2.0 5 votes vote down vote up
def test_list_field_of_dict_field(self):
        from allennlp.data import Instance
        from allennlp.data.iterators import BasicIterator

        tokens3 = "The long sentence .".split()
        tokens3_field = TextField(
            [Token(t) for t in tokens3],
            token_indexers={'tokens': SingleIdTokenIndexer()}
        )

        instance3_fields = {
            "candidate_entities": TextField(
                    [Token("entity1 entity2 entity3"), Token("entity_unk"), Token("entity2 entity3")],
                    token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array([[0.1, 0.1, 0.8],
                                                           [1.0, 0.0, 0.0],
                                                           [0.33, 0.67, 0.0]])),
            "candidate_spans": ListField(
                    [SpanField(1, 1, tokens3_field), SpanField(1, 2, tokens3_field), SpanField(1, 3, tokens3_field)],
            )
        }

        iterator = BasicIterator()
        iterator.index_with(self.vocab)

        instances = [Instance({"candidates": ListField([
                                    DictField(self.instance1_fields),
                                    DictField(self.instance2_fields)])}),
                     Instance({"candidates": ListField([
                                    DictField(self.instance1_fields),
                                    DictField(instance3_fields)])})
        ]

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            pass

        self.assertTrue(batch['candidates']['candidate_entities']['entity'].shape == batch['candidates']['candidate_entity_prior'].shape) 
Example #6
Source File: single_correct_mcq_entailment.py    From multee with Apache License 2.0 5 votes vote down vote up
def text_to_instance(self, # pylint: disable=arguments-differ
                         premises: List[str],
                         hypotheses: List[str],
                         answer_index: int = None,
                         relevant_sentence_idxs: List[int] = None) -> Instance:
        fields = {}
        premises_tokens = [self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
                           for premise in premises]
        hypotheses_tokens = [self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
                             for hypothesis in hypotheses]
        if premises:
            premises_text_fields = [TextField(premise_tokens, self._token_indexers)
                                    for premise_tokens in premises_tokens]
            premises_field = ListField(premises_text_fields)
        else:
            empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
            premises_field = empty_stub.empty_field()
        fields['premises'] = premises_field

        hypotheses_text_fields = [TextField(hypothesis_tokens, self._token_indexers)
                                for hypothesis_tokens in hypotheses_tokens]
        hypotheses_field = ListField(hypotheses_text_fields)
        fields['hypotheses'] = hypotheses_field

        # If sentence relevance is available
        if relevant_sentence_idxs is not None:
            relevance_presence_mask = np.zeros(len(premises))
            for idx in relevant_sentence_idxs:
                relevance_presence_mask[idx] = 1
            fields['relevance_presence_mask'] = ArrayField(np.array(relevance_presence_mask))

        # If entailment labels are available
        if answer_index is not None:
            if answer_index not in range(0, len(hypotheses)):
                raise ConfigurationError("Provided label must be in 0 to {}".format(len(hypotheses)))
            fields['answer_index'] = ArrayField(np.array(answer_index), padding_value=-1, dtype=np.long)

        paragraph_tokens = [token for premise_tokens in premises_tokens for token in premise_tokens]
        paragraph_text_field = TextField(paragraph_tokens, self._token_indexers)
        fields['paragraph'] = paragraph_text_field
        return Instance(fields) 
Example #7
Source File: array_field_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_padding_handles_list_fields_with_padding_values(self):
        array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
        array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy()
        correct_tensor = numpy.array([[[1., 1., 1., -1., -1.],
                                       [1., 1., 1., -1., -1.]],
                                      [[1., 1., 1., 1., 1.],
                                       [-1., -1., -1., -1., -1.]],
                                      [[-1., -1., -1., -1., -1.],
                                       [-1., -1., -1., -1., -1.]]])
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor) 
Example #8
Source File: array_field_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_padding_handles_list_fields(self):
        array1 = ArrayField(numpy.ones([2, 3]))
        array2 = ArrayField(numpy.ones([1, 5]))
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy()
        correct_tensor = numpy.array([[[1., 1., 1., 0., 0.],
                                       [1., 1., 1., 0., 0.]],
                                      [[1., 1., 1., 1., 1.],
                                       [0., 0., 0., 0., 0.]],
                                      [[0., 0., 0., 0., 0.],
                                       [0., 0., 0., 0., 0.]]])
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor) 
Example #9
Source File: list_field_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_printing_doesnt_crash(self):
        list_field = ListField([self.field1, self.field2])
        print(list_field) 
Example #10
Source File: list_field_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1.empty_field(), self.field1, self.field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict[u"words"].detach().cpu().numpy()
        characters = tensor_dict[u"characters"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(words, numpy.array([[0, 0, 0, 0, 0],
                                                                    [2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5]]))

        numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9]))

        numpy.testing.assert_array_almost_equal(characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                                            [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 1, 1, 1, 3, 1, 3, 4, 5],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0]])) 
Example #11
Source File: list_field_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_as_tensor_can_handle_multiple_token_indexers(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict[u"words"].detach().cpu().numpy()
        characters = tensor_dict[u"characters"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(words, numpy.array([[2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5],
                                                                    [2, 3, 1, 5, 0]]))

        numpy.testing.assert_array_almost_equal(characters[0], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                                            [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 1, 1, 1, 3, 1, 3, 4, 5],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

        numpy.testing.assert_array_almost_equal(characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 4, 1, 5, 1, 3, 1, 0, 0],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                                            [0, 0, 0, 0, 0, 0, 0, 0, 0]])) 
Example #12
Source File: list_field_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField([LabelField(c) for c in [u'a', u'b', u'c', u'd', u'e']])
        nested_field2 = ListField([LabelField(c) for c in [u'f', u'g', u'h', u'i', u'j', u'k']])
        list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {u'num_fields': 3, u'list_num_fields': 6}
        tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_almost_equal(tensor, [[-1, -1, -1, -1, -1, -1],
                                                   [0, 1, 2, 3, 4, -1],
                                                   [5, 6, 7, 8, 9, 10]]) 
Example #13
Source File: arc_multichoice_json_reader.py    From OpenBookQA with Apache License 2.0 5 votes vote down vote up
def text_to_instance(self,  # type: ignore
                         item_id: Any,
                         question_text: str,
                         choice_text_list: List[str],
                         answer_id: int
                         ) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        question_tokens = self._tokenizer.tokenize(question_text)
        choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]
        fields['question'] = TextField(question_tokens, self._token_indexers)
        fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
        fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
            "id": item_id,
            "question_text": question_text,
            "choice_text_list": choice_text_list,
            "question_tokens": [x.text for x in question_tokens],
            "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
        }

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields) 
Example #14
Source File: list_field_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_list_field_can_handle_empty_index_fields(self):
        list_field = ListField([self.index_field, self.index_field, self.empty_index_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(), numpy.array([[1], [1], [-1]])) 
Example #15
Source File: list_field_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_list_field_can_handle_empty_text_fields(self):
        list_field = ListField([self.field1, self.field2, self.empty_text_field])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor_dict[u"words"].detach().cpu().numpy(),
                                         numpy.array([[2, 3, 4, 5, 0],
                                                      [2, 3, 4, 1, 5],
                                                      [0, 0, 0, 0, 0]])) 
Example #16
Source File: list_field_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_get_padding_lengths(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()
        assert lengths == {u"num_fields": 3, u"list_num_tokens": 5} 
Example #17
Source File: list_field_test.py    From magnitude with MIT License 5 votes vote down vote up
def test_list_field_can_handle_empty_sequence_label_fields(self):
        list_field = ListField([self.sequence_label_field,
                                self.sequence_label_field,
                                self.empty_sequence_label_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(),
                                         numpy.array([[1, 1, 0, 1],
                                                      [1, 1, 0, 1],
                                                      [0, 0, 0, 0]])) 
Example #18
Source File: list_field_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_empty_list_can_be_tensorized(self):
        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        list_field = ListField([text_field.empty_field()])
        fields = {
            "list": list_field,
            "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
        }
        instance = Instance(fields)
        instance.index_fields(self.vocab)
        instance.as_tensor_dict() 
Example #19
Source File: list_field_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_sequence_methods(self):
        list_field = ListField([self.field1, self.field2, self.field3])

        assert len(list_field) == 3
        assert list_field[1] == self.field2
        assert [f for f in list_field] == [self.field1, self.field2, self.field3] 
Example #20
Source File: list_field_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_printing_doesnt_crash(self):
        list_field = ListField([self.field1, self.field2])
        print(list_field) 
Example #21
Source File: babi.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def text_to_instance(
        self,  # type: ignore
        context: List[List[str]],
        question: List[str],
        answer: str,
        supports: List[int],
    ) -> Instance:

        fields: Dict[str, Field] = {}

        if self._keep_sentences:
            context_field_ks = ListField(
                [
                    TextField([Token(word) for word in line], self._token_indexers)
                    for line in context
                ]
            )

            fields["supports"] = ListField(
                [IndexField(support, context_field_ks) for support in supports]
            )
        else:
            context_field = TextField(
                [Token(word) for line in context for word in line], self._token_indexers
            )

        fields["context"] = context_field_ks if self._keep_sentences else context_field
        fields["question"] = TextField([Token(word) for word in question], self._token_indexers)
        fields["answer"] = TextField([Token(answer)], self._token_indexers)

        return Instance(fields) 
Example #22
Source File: list_field_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_fields_can_pad_to_greater_than_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        padding_lengths["list_words___tokens"] = 7
        padding_lengths["num_fields"] = 5
        tensor_dict = list_field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][0].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 5, 0, 0, 0]),
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][1].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 1, 5, 0, 0]),
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][2].detach().cpu().numpy(),
            numpy.array([2, 3, 1, 5, 0, 0, 0]),
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][3].detach().cpu().numpy(),
            numpy.array([0, 0, 0, 0, 0, 0, 0]),
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][4].detach().cpu().numpy(),
            numpy.array([0, 0, 0, 0, 0, 0, 0]),
        ) 
Example #23
Source File: list_field_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField([LabelField(c) for c in ["a", "b", "c", "d", "e"]])
        nested_field2 = ListField([LabelField(c) for c in ["f", "g", "h", "i", "j", "k"]])
        list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {"num_fields": 3, "list_num_fields": 6}
        tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_almost_equal(
            tensor, [[-1, -1, -1, -1, -1, -1], [0, 1, 2, 3, 4, -1], [5, 6, 7, 8, 9, 10]]
        ) 
Example #24
Source File: list_field_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_all_fields_padded_to_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][0].detach().cpu().numpy(), numpy.array([2, 3, 4, 5, 0])
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][1].detach().cpu().numpy(), numpy.array([2, 3, 4, 1, 5])
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][2].detach().cpu().numpy(), numpy.array([2, 3, 1, 5, 0])
        ) 
Example #25
Source File: list_field_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_list_field_can_handle_empty_sequence_label_fields(self):
        list_field = ListField(
            [self.sequence_label_field, self.sequence_label_field, self.empty_sequence_label_field]
        )
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor.detach().cpu().numpy(), numpy.array([[1, 1, 0, 1], [1, 1, 0, 1], [0, 0, 0, 0]])
        ) 
Example #26
Source File: list_field_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_list_field_can_handle_empty_index_fields(self):
        list_field = ListField([self.index_field, self.index_field, self.empty_index_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor.detach().cpu().numpy(), numpy.array([[1], [1], [-1]])
        ) 
Example #27
Source File: list_field_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_get_padding_lengths(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()
        assert lengths == {"num_fields": 3, "list_words___tokens": 5} 
Example #28
Source File: elmo_indexer_test.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def test_elmo_empty_token_list(self):
        # Basic test
        indexer = ELMoTokenCharactersIndexer()
        assert {"elmo_tokens": []} == indexer.get_empty_token_list()
        # Real world test
        indexer = {"elmo": indexer}
        tokens_1 = TextField([Token("Apple")], indexer)
        targets_1 = ListField([TextField([Token("Apple")], indexer)])
        tokens_2 = TextField([Token("Screen"), Token("device")], indexer)
        targets_2 = ListField(
            [TextField([Token("Screen")], indexer), TextField([Token("Device")], indexer)]
        )
        instance_1 = Instance({"tokens": tokens_1, "targets": targets_1})
        instance_2 = Instance({"tokens": tokens_2, "targets": targets_2})
        a_batch = Batch([instance_1, instance_2])
        a_batch.index_instances(Vocabulary())
        batch_tensor = a_batch.as_tensor_dict()
        elmo_target_token_indices = batch_tensor["targets"]["elmo"]["elmo_tokens"]
        # The TextField that is empty should have been created using the
        # `get_empty_token_list` and then padded with zeros.
        empty_target = elmo_target_token_indices[0][1].numpy()
        np.testing.assert_array_equal(np.zeros((1, 50)), empty_target)
        non_empty_targets = [
            elmo_target_token_indices[0][0],
            elmo_target_token_indices[1][0],
            elmo_target_token_indices[1][1],
        ]
        for non_empty_target in non_empty_targets:
            with pytest.raises(AssertionError):
                np.testing.assert_array_equal(np.zeros((1, 50)), non_empty_target) 
Example #29
Source File: text_classification_json.py    From allennlp with Apache License 2.0 5 votes vote down vote up
def text_to_instance(
        self, text: str, label: Union[str, int] = None
    ) -> Instance:  # type: ignore
        """
        # Parameters

        text : `str`, required.
            The text to classify
        label : `str`, optional, (default = `None`).
            The label for this text.

        # Returns

        An `Instance` containing the following fields:
            - tokens (`TextField`) :
              The tokens in the sentence or phrase.
            - label (`LabelField`) :
              The label label of the sentence or phrase.
        """

        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens, self._token_indexers)
        if label is not None:
            fields["label"] = LabelField(label, skip_indexing=self._skip_label_indexing)
        return Instance(fields) 
Example #30
Source File: wordnet.py    From kb with Apache License 2.0 4 votes vote down vote up
def text_to_instance(self,
                         tokens: List[str],
                         candidate_entities: List[List[str]],
                         candidate_spans: List[List[int]],
                         candidate_entity_prior: List[List[float]],
                         gold_entities: List[str] = None,
                         gold_data_ids: List[str] = None):

        # prior needs to be 2D and full
        # can look like [[0.2, 0.8], [1.0]]  if one candidate for second
        # candidate span and two candidates for first
        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)

        fields = {
            "tokens": TextField([Token(t) for t in tokens],
                      token_indexers=self.token_indexers),

            # join by space, then retokenize in the "character indexer"
            "candidate_entities": TextField(
                [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
                token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array(np_prior)),
            # only one sentence
            "candidate_segment_ids": ArrayField(
                np.array([0] * len(candidate_entities)), dtype=np.int
            )
        }

        if gold_entities is not None:
            fields["gold_entities"] =  TextField([Token(entity) for entity in gold_entities],
                                                  token_indexers=self.entity_indexer)
        if gold_data_ids is not None:
            fields["gold_data_ids"] = MetadataField(gold_data_ids)

        span_fields = []
        for span in candidate_spans:
            span_fields.append(SpanField(span[0], span[1], fields['tokens']))
        fields['candidate_spans'] = ListField(span_fields)

        if self.extra_candidate_generators:
            tokens = " ".join(tokens)
            extra_candidates = {
                    key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
                    for key, generator in self.extra_candidate_generators.items()
            }
            fields['extra_candidates'] = MetadataField(extra_candidates)

        return Instance(fields, should_remap_span_indices=self.should_remap_span_indices)