Python pycrfsuite.Trainer() Examples

The following are 30 code examples of pycrfsuite.Trainer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pycrfsuite, or try the search function.
Example #1
Source File: crf.py    From razzy-spinner with GNU General Public License v3.0 6 votes vote down vote up
def train(self, train_data, model_file):
        """
        Fit a CRFSuite model on annotated sentences and register the result.

        :param train_data: annotated sentences, each a sequence of
            (token, label) pairs.
        :type train_data: list(list(tuple(str, str)))
        :param model_file: path the trained model is written to.
        """
        crf_trainer = pycrfsuite.Trainer(verbose=self._verbose)
        crf_trainer.set_params(self._training_options)

        for sentence in train_data:
            words, tags = zip(*sentence)
            # The feature function receives the whole token sequence plus the
            # position currently being described.
            feature_seq = [self._feature_func(words, pos) for pos, _ in enumerate(words)]
            crf_trainer.append(feature_seq, tags)

        # Training writes the model to ``model_file``.
        crf_trainer.train(model_file)
        # Register the freshly written model file on this object.
        self.set_model_file(model_file)
Example #2
Source File: test_tagger.py    From python-crfsuite with MIT License 6 votes vote down vote up
def test_tag_formats(tmpdir, xseq, yseq):
    """Check that tagging yields the training labels for dict and key-only inputs."""
    model_path = str(tmpdir.join('model.crfsuite'))
    # make all coefficients 1 and check that results are the same
    unit_xseq = [{name: 1 for name in item} for item in xseq]

    crf = Trainer()
    crf.set('c2', 1e-6)  # make sure model overfits
    crf.append(unit_xseq, yseq)
    crf.train(model_path)

    with Tagger().open(model_path) as dict_tagger:
        assert dict_tagger.tag(unit_xseq) == yseq

    # strings: only the feature names are passed, without explicit weights
    key_views = [item.keys() for item in unit_xseq]
    with Tagger().open(model_path) as key_tagger:
        assert key_tagger.tag(key_views) == yseq
Example #3
Source File: crf.py    From webQA_sequence_labelling_pytorch with MIT License 6 votes vote down vote up
def train_crf(x_train, y_train):
    """Train a CRF on paired feature/label sequences and save it to ``param.crf_path``.

    :param x_train: iterable of feature sequences.
    :param y_train: iterable of label sequences, aligned with ``x_train``.
    """
    print('Training...')
    crf = pycrfsuite.Trainer(verbose=False)
    for features, labels in zip(x_train, y_train):
        crf.append(features, labels)

    hyperparams = {
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 500,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
    }
    crf.set_params(hyperparams)

    crf.train(param.crf_path)
Example #4
Source File: test_trainer.py    From python-crfsuite with MIT License 6 votes vote down vote up
def test_training_messages(tmpdir, xseq, yseq):
    """Verify that training output is delivered through ``Trainer.message``."""

    class CapturingTrainer(Trainer):
        # Stores every message it is handed in ``self.messages``.
        def __init__(self):
            # NOTE: does not chain to Trainer.__init__; only the list is set up.
            self.messages = []

        def message(self, message):
            self.messages.append(message)

    capturing = CapturingTrainer()
    capturing.select('lbfgs')
    capturing.append(xseq, yseq)
    # Nothing has been reported before training starts.
    assert not capturing.messages

    model_path = str(tmpdir.join('model.crfsuite'))
    capturing.train(model_path)
    logged = capturing.messages
    assert logged
    assert 'type: CRF1d\n' in logged
Example #5
Source File: test_trainer.py    From python-crfsuite with MIT License 6 votes vote down vote up
def test_training_messages_exception(tmpdir, xseq, yseq):
    """An exception raised inside ``message`` must surface from ``train``."""

    class MyException(Exception):
        pass

    class BadTrainer(Trainer):
        # Fails on the first message it is asked to handle.
        def message(self, message):
            raise MyException("error")

    failing = BadTrainer()
    failing.select('lbfgs')
    failing.append(xseq, yseq)

    target_path = str(tmpdir.join('model.crfsuite'))

    with pytest.raises(MyException):
        failing.train(target_path)
Example #6
Source File: crfsuiteutil.py    From estnltk with GNU General Public License v2.0 6 votes vote down vote up
def train(self, nerdocs, mode_filename):
        """Train a CRF model on the given documents and save it.

        Parameters
        ----------
        nerdocs: list of estnltk.estner.ner.Document.
            The documents for model training.
        mode_filename: str
            The filename where to save the model.
        """
        crf_trainer = pycrfsuite.Trainer(algorithm=self.algorithm,
                                         params={'c2': self.c2},
                                         verbose=self.verbose)

        # Flatten documents into a single stream of sentences.
        sentences = (sentence for document in nerdocs for sentence in document.sentences)
        for sentence in sentences:
            # One training instance per sentence: per-token features and labels.
            token_features = [token.feature_list() for token in sentence]
            token_labels = [token.label for token in sentence]
            crf_trainer.append(token_features, token_labels)

        crf_trainer.train(mode_filename)
Example #7
Source File: crf.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 6 votes vote down vote up
def train(self, train_data, model_file):
        '''
        Train the CRF tagger with CRFSuite and remember the resulting model.

        :param train_data: the list of annotated sentences.
        :type train_data: list (list(tuple(str,str)))
        :param model_file: the model will be saved to this file.
        '''
        suite_trainer = pycrfsuite.Trainer(verbose=self._verbose)
        suite_trainer.set_params(self._training_options)

        for annotated in train_data:
            # Split the (token, label) pairs into two parallel tuples.
            sent_tokens, sent_labels = zip(*annotated)
            positions = range(len(sent_tokens))
            suite_trainer.append(
                [self._feature_func(sent_tokens, idx) for idx in positions],
                sent_labels,
            )

        # Run training; the model is written to ``model_file``.
        suite_trainer.train(model_file)
        # Record the model file on this object.
        self.set_model_file(model_file)
Example #8
Source File: test_trainer.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_help_invalid_parameter():
    """``help`` must raise ValueError for names that 'l2sgd' does not accept."""
    sgd_trainer = Trainer()
    sgd_trainer.select('l2sgd')

    # This segfaults without a workaround;
    # see https://github.com/chokkan/crfsuite/pull/21
    for rejected_name in ('foo', 'c1'):
        with pytest.raises(ValueError):
            sgd_trainer.help(rejected_name)
Example #9
Source File: test_trainer.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_params_and_help():
    """Parameter names and help text must follow the selected algorithm."""
    crf = Trainer()

    crf.select('lbfgs')
    for expected_name in ('c1', 'c2', 'num_memories'):
        assert expected_name in crf.params()
    assert 'L1' in crf.help('c1')

    # After switching algorithm, 'c1' is no longer listed.
    crf.select('l2sgd')
    assert 'c2' in crf.params()
    assert 'c1' not in crf.params()
    assert 'L2' in crf.help('c2')
Example #10
Source File: test_trainer.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_trainer_select_raises_error():
    """Selecting an unknown algorithm name must raise ValueError."""
    fresh_trainer = Trainer()
    with pytest.raises(ValueError):
        fresh_trainer.select('foo')
Example #11
Source File: test_trainer.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_get_parameter():
    """``get`` must return the value previously stored with ``set``."""
    tolerance = 1e-6
    target = 0.1

    sgd = Trainer()
    sgd.select('l2sgd')
    # Before the explicit set, 'c2' differs from the target value.
    assert abs(sgd.get('c2') - target) > tolerance
    sgd.set('c2', target)
    assert abs(sgd.get('c2') - target) < tolerance
Example #12
Source File: test_trainer.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_set_parameters_in_constructor():
    """Parameters passed via ``params=`` must be readable back with ``get``."""
    expected_c2 = 100
    configured = Trainer(params={'c2': expected_c2})
    assert abs(configured.get('c2') - expected_c2) < 1e-6
Example #13
Source File: crf_trainer.py    From MicroTokenizer with MIT License 5 votes vote down vote up
def __init__(self, feature_func_list=None):
        """Create a quiet CRFsuite trainer and choose the feature functions.

        :param feature_func_list: feature functions to use; when missing or
            empty, ``default_feature_func_list`` is used instead.
        """
        self.crf_trainer = pycrfsuite.Trainer(verbose=False)

        # Any falsy value (None, empty list) falls back to the defaults.
        self.feature_func_list = feature_func_list or default_feature_func_list
Example #14
Source File: test_trainer.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_trainer_noselect_noappend(tmpdir):
    """Training with no algorithm selected and no data appended must not crash."""
    # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21
    output_path = str(tmpdir.join('model.crfsuite'))
    untouched = Trainer()
    untouched.train(output_path)
Example #15
Source File: test_trainer.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_trainer_noappend(tmpdir):
    """Training after ``select`` but without any appended data must not crash."""
    # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21
    empty_trainer = Trainer()
    empty_trainer.select('lbfgs')
    output_path = str(tmpdir.join('model.crfsuite'))
    empty_trainer.train(output_path)
Example #16
Source File: test_trainer.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_trainer(tmpdir, xseq, yseq):
    """``train`` must create the model file at the requested path."""
    lbfgs = Trainer('lbfgs')
    lbfgs.append(xseq, yseq)

    model_path = str(tmpdir.join('model.crfsuite'))
    # The file must not exist before training and must exist afterwards.
    existed_before = os.path.isfile(model_path)
    assert not existed_before
    lbfgs.train(model_path)
    assert os.path.isfile(model_path)
Example #17
Source File: CRF.py    From indic_tagger with Apache License 2.0 5 votes vote down vote up
def __init__(self, model_path):
        """Create a quiet CRFsuite trainer and configure its training parameters.

        :param model_path: stored unchanged on the instance as ``self.model_path``.
        """
        self.trainer = pycrfsuite.Trainer(verbose=False)
        self.model_path = model_path
        self.trainer.set_params({
            'c1': 1.0,   # coefficient for L1 penalty
            # Previously written as ``1-3``, which is integer subtraction (-2)
            # and gave a negative L2 coefficient; the float 1e-3 was intended.
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier

            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })
Example #18
Source File: conftest.py    From python-crfsuite with MIT License 5 votes vote down vote up
def model_filename(tmpdir, xseq, yseq):
    """Train a small L-BFGS model in ``tmpdir`` and return the path of its file."""
    from pycrfsuite import Trainer

    path = str(tmpdir.join('model.crfsuite'))
    quiet_trainer = Trainer('lbfgs', verbose=False)
    quiet_trainer.append(xseq, yseq)
    quiet_trainer.train(path)
    return path
Example #19
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_append_strstr_dicts(tmpdir):
    """Dict features with string values must become ``key:value`` attributes.

    The trained model is expected to expose exactly the attributes
    ``foo:bar`` and ``baz``.
    """
    items = [
        {'foo': 'bar'},
        {'baz': False},
        {'foo': 'bar', 'baz': True},
        {'baz': 0.2},
    ]
    tags = ['spam', 'egg', 'spam', 'spam']

    crf = Trainer()
    crf.append(items, tags)
    model_path = str(tmpdir.join('model.crfsuite'))
    crf.train(model_path)

    with Tagger().open(model_path) as tagger:
        model_info = tagger.info()
        assert set(model_info.attributes.keys()) == {'foo:bar', 'baz'}
        assert model_info.state_features[('foo:bar', 'spam')] > 0
Example #20
Source File: entity_extractor_worker.py    From texta with GNU General Public License v3.0 5 votes vote down vote up
def _train_and_save(self, X_train, y_train):
        """Feed training sequences to a CRF trainer, train it and save the model.

        Sequences are appended until the data runs out or available memory
        drops below ``self.min_mb_available_memory``; in the latter case a
        warning is stored in ``self.train_summary`` and logged.

        :param X_train: iterable of feature sequences.
        :param y_train: iterable of label sequences, aligned with ``X_train``.
        :return: the trainer after training.
        """
        crf_trainer = Trainer(verbose=False)
        for doc_index, (features, labels) in enumerate(zip(X_train, y_train)):
            # Available memory is only sampled on every 2500th sequence.
            low_on_memory = (
                doc_index % 2500 == 0
                and (psutil.virtual_memory().available / 1000000) < self.min_mb_available_memory
            )
            if not low_on_memory:
                crf_trainer.append(features, labels)
                continue

            print('EntityExtractorWorker:_get_memory_safe_features - Less than {} Mb of memory remaining, breaking adding more data.'.format(self.min_mb_available_memory))
            self.train_summary["warning"] = "Trained on {} documents, because more documents don't fit into memory".format(doc_index)

            memory_event = {
                'task': 'EntityExtractorWorker:_train_and_save',
                'event': 'Less than {}Mb of memory available, stopping adding more training data. Iteration {}.'.format(self.min_mb_available_memory, doc_index),
                'data': {'task_id': self.task_id}
            }
            self.info_logger.info("Memory", extra=memory_event)
            break

        training_params = {
            'c1': 0.5,  # coefficient for L1 penalty
            'c2': 1e-4,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier
            # transitions that are possible, but not observed
            'feature.possible_transitions': True,
        }
        crf_trainer.set_params(training_params)

        # Train and save
        model_path = create_file_path(self.model_name, MODELS_DIR, self.task_type)
        crf_trainer.train(model_path)
        return crf_trainer
Example #21
Source File: crf.py    From webQA_sequence_labelling_pytorch with MIT License 5 votes vote down vote up
def test():
    """Train a tiny CRF on one hard-coded sequence and print its predictions.

    The model is written to ``conll2002-esp.crfsuite`` in the current working
    directory, then re-opened with a tagger to label the training sequence.
    """
    # Each item is a list of "name=value" feature strings. (An earlier
    # dict-based ``X_train`` literal was overwritten before use and is gone.)
    X_train = [[['foo=1', 'bar=0', 'c=9', 's=0', 'sd=12', 'cd=2', 'ca=3', 'd=True', 'cc=89'],
                ['foo=4', 'bar=7', 'c=3', 's=1', 'sd=8', 'cd=9', 'ca=1', 'd=False', 'cc=18']]]
    y_train = [['0', '1']]
    model_path = 'conll2002-esp.crfsuite'

    trainer = pycrfsuite.Trainer(verbose=False)

    for xseq, yseq in zip(X_train, y_train):
        print('x: ', xseq)
        print('y: ', yseq)
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 500,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    trainer.train(model_path)

    # Use the context-manager form so the model is closed once tagging is
    # done instead of leaving the tagger open.
    with pycrfsuite.Tagger().open(model_path) as tagger:
        print("Predicted:", ' '.join(tagger.tag(X_train[0])))
    print("Correct:  ", ' '.join(y_train[0]))
Example #22
Source File: entity_extractor.py    From ai-chatbot-framework with MIT License 5 votes vote down vote up
def train(self, train_sentences, model_name):
        """
        Train an NER model and store it under ``model_files/<model_name>.model``.

        :param train_sentences: sentences to train on; each one is converted
            with ``sent_to_features`` and ``sent_to_labels``.
        :param model_name: name used to build the output file path.
        :return: always ``True``.
        """
        feature_seqs = [self.sent_to_features(sentence) for sentence in train_sentences]
        label_seqs = [self.sent_to_labels(sentence) for sentence in train_sentences]

        crf = pycrfsuite.Trainer(verbose=False)
        for sent_features, sent_labels in zip(feature_seqs, label_seqs):
            crf.append(sent_features, sent_labels)

        training_params = {
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier
            # include transitions that are possible, but not observed
            'feature.possible_transitions': True,
        }
        crf.set_params(training_params)

        output_path = 'model_files/%s.model' % model_name
        crf.train(output_path)
        return True

    # Extract Labels from BIO tagged sentence 
Example #23
Source File: extractor.py    From HotPepperGourmetDialogue with MIT License 5 votes vote down vote up
def train(self, train_x, train_y, save_file='model.crfsuite'):
        """Train a CRF on the given sequences, save it and open it in the tagger.

        :param train_x: iterable of feature sequences.
        :param train_y: iterable of label sequences, aligned with ``train_x``.
        :param save_file: path the trained model is written to.
        """
        crf = pycrfsuite.Trainer(verbose=False)
        for features, labels in zip(train_x, train_y):
            crf.append(features, labels)

        training_params = {
            'c1': 1.0,   # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier
            'feature.possible_transitions': True,
        }
        crf.set_params(training_params)
        crf.train(save_file)

        # Open the freshly trained model in this object's tagger.
        self.__tagger.open(save_file)
Example #24
Source File: tag.py    From ChemDataExtractor with MIT License 5 votes vote down vote up
def train(self, sentences, model):
        """Train the CRF tagger with CRFSuite, then load the resulting model.

        :params sentences: Annotated sentences, each a sequence of
            (token, label) pairs.
        :params model: Path the trained model is saved to.
        """
        crf = pycrfsuite.Trainer(verbose=True)
        crf.set_params(self.params)
        for annotated in sentences:
            words, tags = zip(*annotated)
            # Features are computed per position from the full token sequence.
            feature_seq = [self._get_features(words, pos) for pos, _ in enumerate(words)]
            crf.append(feature_seq, tags)
        crf.train(model)
        self.load(model)
Example #25
Source File: crf_pos_tagger.py    From Jiayan with MIT License 5 votes vote down vote up
def train(self, train_x, train_y, out_model):
        """Train a CRF on non-empty sequence pairs and print the last iteration log.

        :param train_x: iterable of feature sequences.
        :param train_y: iterable of label sequences, aligned with ``train_x``.
        :param out_model: path the trained model is written to.
        """
        crf = pycrfsuite.Trainer(verbose=False)
        for features, labels in zip(train_x, train_y):
            # Pairs where either side is empty are skipped.
            if not features or not labels:
                continue
            crf.append(features, labels)

        training_params = {
            # coefficient for L1 penalty
            'c1': 1.0,
            # coefficient for L2 penalty
            'c2': 1e-3,
            # stop earlier
            'max_iterations': 50,
            # include transitions that are possible, but not observed
            'feature.possible_transitions': True,
        }
        crf.set_params(training_params)

        crf.train(out_model)
        print(crf.logparser.last_iteration)
Example #26
Source File: crf_sent_tagger.py    From Jiayan with MIT License 5 votes vote down vote up
def train(self, train_x, train_y, out_model):
        """Fit a CRF on the usable (non-empty) pairs and report the final iteration.

        :param train_x: iterable of feature sequences.
        :param train_y: iterable of label sequences, aligned with ``train_x``.
        :param out_model: path the trained model is written to.
        """
        sent_trainer = pycrfsuite.Trainer(verbose=False)
        for xseq, yseq in zip(train_x, train_y):
            usable = xseq and yseq
            if usable:
                sent_trainer.append(xseq, yseq)

        sent_trainer.set_params(dict([
            ('c1', 1.0),                            # coefficient for L1 penalty
            ('c2', 1e-3),                           # coefficient for L2 penalty
            ('max_iterations', 50),                 # stop earlier
            ('feature.possible_transitions', True)  # include transitions that are possible, but not observed
        ]))

        sent_trainer.train(out_model)
        final_iteration = sent_trainer.logparser.last_iteration
        print(final_iteration)
Example #27
Source File: pycrfsuite.py    From TBBTCorpus with Apache License 2.0 5 votes vote down vote up
def __init__(self, enumerations=100, L1Penalty=1.0, L2Penalty=1e-3):
        """Create a quiet CRF trainer configured with the given hyper-parameters.

        :param enumerations: value passed as ``max_iterations``.
        :param L1Penalty: value passed as ``c1``.
        :param L2Penalty: value passed as ``c2``.
        """
        trainer_params = {
            'c1': L1Penalty,
            'c2': L2Penalty,
            'max_iterations': enumerations,
            'feature.possible_transitions': True,
        }
        self.crf_feature_train = crf.Trainer(verbose=False)
        self.crf_feature_train.set_params(trainer_params)
    
    #Method to append more features to the trainer
    #More features include TOKEN and its respective POS
    #It also includes the act tag of the sentence 
Example #28
Source File: training.py    From parserator with MIT License 5 votes vote down vote up
def trainModel(training_data, module, model_path,
               params_to_set={'c1':0.1, 'c2':0.01, 'feature.minfreq':0}):
    """Train a CRF from labeled token sequences and write it to ``model_path``.

    :param training_data: iterable of 2-item entries; the second item is a
        sequence of (token, label) pairs (the first item is ignored).
    :param module: object providing ``tokens2features(tokens)``.
    :param model_path: path the trained model is written to.
    :param params_to_set: trainer parameters; the default dict is shared
        between calls and is only passed on here, never modified.
    """
    crf = pycrfsuite.Trainer(verbose=False, params=params_to_set)

    for entry in training_data:
        labeled_tokens = entry[1]
        tokens, labels = zip(*labeled_tokens)
        features = module.tokens2features(tokens)
        crf.append(features, labels)

    crf.train(model_path)
Example #29
Source File: utils.py    From parserator with MIT License 5 votes vote down vote up
def fit(self, X, y, model_path, **params):
        """Train a CRF on raw texts and their labels and save it to ``model_path``.

        ``model_path`` now precedes ``**params``: a parameter placed after
        ``**params`` is a SyntaxError, so the original signature could not be
        compiled.

        :param X: iterable of raw text strings; each is split with ``tokenize``.
        :param y: iterable of label sequences, aligned with ``X``.
        :param model_path: path the trained model is written to.
        :param params: accepted for signature compatibility only; the values
            are discarded because ``params`` is rebuilt from ``self.__dict__``.
        """
        # sklearn requires parameters to be declared as fields of the estimator,
        # and we can't have a full stop in a field name, so the fields use an
        # underscore instead. Map the underscores back to full stops here.
        # NOTE(review): every underscore in every instance attribute name is
        # replaced -- confirm no attribute needs to keep a literal underscore.
        params = {k.replace('_', '.'): v for k, v in self.__dict__.items()}
        trainer = pycrfsuite.Trainer(verbose=False, params=params)
        for raw_text, labels in zip(X, y):
            tokens = tokenize(raw_text)
            trainer.append(tokens2features(tokens), labels)
        trainer.train(model_path)
        reload(parserator)
Example #30
Source File: test_tagger.py    From python-crfsuite with MIT License 4 votes vote down vote up
def test_append_nested_dicts(tmpdir):
    """Nested dict/list/set features must flatten into colon-joined attributes.

    After training on two items labeled 'first' and 'second', the model is
    expected to hold exactly the flattened attribute names, with positive
    weights only for the label each feature appeared with.
    """
    first_item = {
        "foo": {
            "bar": "baz",
            "spam": 0.5,
            "egg": ["x", "y"],
            "ham": {"x": -0.5, "y": -0.1},
        },
    }
    second_item = {
        "foo": {
            "bar": "ham",
            "spam": -0.5,
            "ham": {"x", "y"},
        },
    }

    crf = Trainer()
    crf.append([first_item, second_item], ['first', 'second'])
    model_path = str(tmpdir.join('model.crfsuite'))
    crf.train(model_path)

    expected_attributes = {
        'foo:bar:baz',
        'foo:spam',
        'foo:egg:x',
        'foo:egg:y',
        'foo:ham:x',
        'foo:ham:y',
        'foo:bar:ham',
    }
    first_only = ('foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y')
    second_only = ('foo:bar:ham', 'foo:ham:x', 'foo:ham:y')

    with Tagger().open(model_path) as tagger:
        model_info = tagger.info()
        assert set(model_info.attributes.keys()) == expected_attributes

        weights = model_info.state_features
        for attr in first_only:
            assert weights[(attr, 'first')] > 0
            assert weights.get((attr, 'second'), 0) <= 0

        for attr in second_only:
            assert weights[(attr, 'second')] > 0
            assert weights.get((attr, 'first'), 0) <= 0