Python pycrfsuite.Tagger() Examples

The following are 30 code examples of pycrfsuite.Tagger(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pycrfsuite, or try the search function.
Example #1
Source File: entity_extractor.py    From ai-chatbot-framework with MIT License 6 votes vote down vote up
def predict(self, model_name, sentence):
        """
        Label a sentence with a stored CRF model and return its entities.

        :param model_name: base name of the ``.model`` file looked up under
            ``app.config["MODELS_DIR"]``
        :param sentence: raw query text to label
        :return: output of ``self.replace_synonyms`` applied to the entities
            produced by ``self.crf2json``
        """
        from app.nlu.tasks import pos_tagger

        # The words paired with the labels come from the spaCy tokenizer,
        # while the CRF features are derived from the POS tagger output.
        words = [token.text for token in spacy_tokenizer(sentence)]
        pos_tagged = pos_tagger(sentence)

        crf = pycrfsuite.Tagger()
        model_path = "{}/{}.model".format(app.config["MODELS_DIR"], model_name)
        crf.open(model_path)

        labels = crf.tag(self.sent_to_features(pos_tagged))
        entities = self.crf2json(zip(words, labels))
        return self.replace_synonyms(entities)
Example #2
Source File: crf_sent_tagger.py    From Jiayan with MIT License 6 votes vote down vote up
def eval(self, test_x, test_y, crf_model):
        """
        Tag every test sequence with a saved CRF model and print a report.

        :param test_x: iterable of feature sequences, one per test sample
        :param test_y: iterable of gold label sequences matching ``test_x``
        :param crf_model: path of the model file to open
        """
        tagger = pycrfsuite.Tagger()
        tagger.open(crf_model)

        y_pred = [tagger.tag(features) for features in test_x]

        # The binarizer is fitted on the gold labels only; predictions are
        # transformed with that same fitted label set.
        binarizer = LabelBinarizer()
        gold_flat = list(chain.from_iterable(test_y))
        y_true_all = binarizer.fit_transform(gold_flat)
        pred_flat = list(chain.from_iterable(y_pred))
        y_pred_all = binarizer.transform(pred_flat)

        tagset = sorted(set(binarizer.classes_))
        class_indices = {
            label: position
            for position, label in enumerate(binarizer.classes_)
        }
        label_ids = [class_indices[label] for label in tagset]

        report = classification_report(
            y_true_all,
            y_pred_all,
            labels=label_ids,
            target_names=tagset,
            digits=5
        )
        print(report)
Example #3
Source File: crf_pos_tagger.py    From Jiayan with MIT License 6 votes vote down vote up
def eval(self, test_x, test_y, crf_model):
        """
        Evaluate a saved CRF model on test data and print per-label scores.

        :param test_x: iterable of feature sequences, one per test sample
        :param test_y: iterable of gold label sequences matching ``test_x``
        :param crf_model: path of the model file to open
        """
        crf = pycrfsuite.Tagger()
        crf.open(crf_model)

        predictions = [crf.tag(sample) for sample in test_x]

        # Fit the binarizer on the gold labels, then reuse it unchanged for
        # the predicted labels.
        encoder = LabelBinarizer()
        flat_gold = list(chain.from_iterable(test_y))
        y_true_all = encoder.fit_transform(flat_gold)
        flat_predicted = list(chain.from_iterable(predictions))
        y_pred_all = encoder.transform(flat_predicted)

        tagset = sorted(set(encoder.classes_))
        index_of = {tag: idx for idx, tag in enumerate(encoder.classes_)}
        ordered_indices = [index_of[tag] for tag in tagset]

        summary = classification_report(
            y_true_all,
            y_pred_all,
            labels=ordered_indices,
            target_names=tagset,
            digits=5
        )
        print(summary)
Example #4
Source File: test_tagger.py    From python-crfsuite with MIT License 6 votes vote down vote up
def test_tag_formats(tmpdir, xseq, yseq):
    """Train on all-ones features and check dict and key-only inputs tag alike."""
    # make all coefficients 1 and check that results are the same
    model_filename = str(tmpdir.join('model.crfsuite'))
    xseq = [{key: 1 for key in item} for item in xseq]

    trainer = Trainer()
    trainer.set('c2', 1e-6)  # make sure model overfits
    trainer.append(xseq, yseq)
    trainer.train(model_filename)

    # dicts with explicit weights
    with Tagger().open(model_filename) as tagger:
        from_dicts = tagger.tag(xseq)
        assert from_dicts == yseq

    # strings: only the feature names are passed
    with Tagger().open(model_filename) as tagger:
        data = [item.keys() for item in xseq]
        from_keys = tagger.tag(data)
        assert from_keys == yseq
Example #5
Source File: model_crf.py    From underthesea with GNU General Public License v3.0 6 votes vote down vote up
def __init__(self):
        """Open the bundled chunking CRF model and build its feature transformer."""
        model_path = join(dirname(__file__), "chunk_crf_2017_10_12.bin")
        self.model = pycrfsuite.Tagger()
        self.model.open(model_path)

        # The template is assembled from named groups; the concatenation
        # order below is the order handed to the transformer.
        lowercase = [
            "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower",
            "T[2].lower",
        ]
        title_case = ["T[0].istitle", "T[-1].istitle", "T[1].istitle"]
        # word unigram and bigram
        words = [
            "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]",
            "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]",
        ]
        # pos unigram and bigram
        pos_tags = [
            "T[-2][1]", "T[-1][1]", "T[0][1]", "T[1][1]", "T[2][1]",
            "T[-2,-1][1]", "T[-1,0][1]", "T[0,1][1]", "T[1,2][1]",
        ]
        # chunk
        chunks = ["T[-3][2]", "T[-2][2]", "T[-1][2]"]

        template = lowercase + title_case + words + pos_tags + chunks
        self.transformer = TaggedTransformer(template)
Example #6
Source File: model_crf.py    From underthesea with GNU General Public License v3.0 6 votes vote down vote up
def __init__(self):
        """Open the bundled POS-tagging CRF model and build its feature transformer."""
        model_path = join(dirname(__file__), "pos_crf_2017_10_11.bin")
        self.model = pycrfsuite.Tagger()
        self.model.open(model_path)

        # The template is assembled from named groups; the concatenation
        # order below is the order handed to the transformer.
        lowercase = [
            "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower",
            "T[2].lower",
        ]
        title_case = ["T[0].istitle", "T[-1].istitle", "T[1].istitle"]
        # word unigram and bigram
        words = [
            "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]",
            "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]",
        ]
        # pos unigram and bigram
        pos_tags = [
            "T[-3][1]", "T[-2][1]", "T[-1][1]",
            "T[-3,-2][1]", "T[-2,-1][1]",
        ]

        template = lowercase + title_case + words + pos_tags
        self.transformer = TaggedTransformer(template)
Example #7
Source File: model_crf.py    From underthesea with GNU General Public License v3.0 6 votes vote down vote up
def __init__(self):
        """Open the bundled NER CRF model and build its feature transformer."""
        model_path = join(dirname(__file__), "ner_crf_2017_10_12.bin")
        self.model = pycrfsuite.Tagger()
        self.model.open(model_path)

        # The template is assembled from named groups; the concatenation
        # order below is the order handed to the transformer.
        lowercase = [
            "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower",
            "T[2].lower",
        ]
        title_case = [
            "T[0].istitle", "T[-1].istitle", "T[1].istitle", "T[-2].istitle",
            "T[2].istitle",
        ]
        # word unigram and bigram
        words = [
            "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]",
            "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]",
        ]
        # pos unigram and bigram
        pos_tags = [
            "T[-2][1]", "T[-1][1]", "T[0][1]", "T[1][1]", "T[2][1]",
            "T[-2,-1][1]", "T[-1,0][1]", "T[0,1][1]", "T[1,2][1]",
        ]
        # ner
        ner_tags = ["T[-3][3]", "T[-2][3]", "T[-1][3]"]

        template = lowercase + title_case + words + pos_tags + ner_tags
        self.transformer = TaggedTransformer(template)
Example #8
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_open_non_existing():
    """Opening the path ``'foo'`` is expected to raise ``IOError``."""
    unopened = Tagger()
    with pytest.raises(IOError):
        unopened.open('foo')
Example #9
Source File: crf.py    From webQA_sequence_labelling_pytorch with MIT License 5 votes vote down vote up
def test():
    """Train a tiny CRF on one hand-written sequence and tag it back.

    The model is written to the relative path ``conll2002-esp.crfsuite``;
    the training items, the predicted labels and the gold labels are printed.
    """
    # One training sequence of two items, each a list of "name=value"
    # feature strings. A dict-based variant that used to be assigned here
    # first was overwritten before any use, so it has been dropped.
    X_train = [[['foo=1', 'bar=0', 'c=9', 's=0', 'sd=12', 'cd=2', 'ca=3', 'd=True', 'cc=89'],
            ['foo=4', 'bar=7', 'c=3', 's=1', 'sd=8', 'cd=9', 'ca=1','d=False', 'cc=18']]]
    y_train = [['0', '1']]

    trainer = pycrfsuite.Trainer(verbose=False)

    for xseq, yseq in zip(X_train, y_train):
        print('x: ', xseq)
        print('y: ', yseq)
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 500,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    trainer.train('conll2002-esp.crfsuite')

    # Reload the model that was just written and tag the training sequence.
    tagger = pycrfsuite.Tagger()
    tagger.open('conll2002-esp.crfsuite')

    print("Predicted:", ' '.join(tagger.tag(X_train[0])))
    print("Correct:  ", ' '.join(y_train[0]))
Example #10
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_open_invalid():
    """Opening this source file as a model is expected to raise ``ValueError``."""
    not_a_model = __file__
    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open(not_a_model)
Example #11
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_open_invalid_small(tmpdir):
    """Opening a three-byte file is expected to raise ``ValueError``."""
    tiny_file = tmpdir.join('tmp.txt')
    tiny_file.write(b'foo')
    tiny_path = str(tiny_file)

    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open(tiny_path)
Example #12
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_open_invalid_small_with_correct_signature(tmpdir):
    """A short file starting with ``lCRF`` is still expected to raise ``ValueError``."""
    truncated = tmpdir.join('tmp.txt')
    truncated.write(b"lCRFfoo")
    truncated_path = str(truncated)

    tagger = Tagger()
    with pytest.raises(ValueError):
        tagger.open(truncated_path)
Example #13
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_open_inmemory(model_bytes, xseq, yseq):
    """A model opened from bytes must tag ``xseq`` as ``yseq``."""
    with Tagger().open_inmemory(model_bytes) as tagger:
        predicted = tagger.tag(xseq)
        assert predicted == yseq
Example #14
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_open_inmemory_invalid():
    """Empty bytes and a short ``lCRF``-prefixed payload must both raise ``ValueError``."""
    tagger = Tagger()
    # Same tagger instance is reused for both payloads, in this order.
    for payload in (b'', b'lCRFabc'):
        with pytest.raises(ValueError):
            tagger.open_inmemory(payload)
Example #15
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_tag_not_opened(xseq):
    """Tagging with a tagger that never opened a model must raise."""
    never_opened = Tagger()
    with pytest.raises(Exception):
        never_opened.tag(xseq)
Example #16
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_tag(model_filename, xseq, yseq):
    """A model opened from a file must tag ``xseq`` as ``yseq``."""
    with Tagger().open(model_filename) as tagger:
        predicted = tagger.tag(xseq)
        assert predicted == yseq
Example #17
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_tag_item_sequence(model_filename, xseq, yseq):
    """Tagging an ``ItemSequence`` wrapper of ``xseq`` must yield ``yseq``."""
    with Tagger().open(model_filename) as tagger:
        wrapped = ItemSequence(xseq)
        predicted = tagger.tag(wrapped)
        assert predicted == yseq
Example #18
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_tag_bools(model_filename, xseq, yseq):
    """Tagging must give ``yseq`` when 0/1 feature values are passed as bools."""
    def _as_bool_if_binary(value):
        # Some values are bools:
        # True <=> 1.0; False <=> 0.0
        if value == 0 or value == 1:
            return bool(value)
        return value

    with Tagger().open(model_filename) as tagger:
        data = [
            {name: _as_bool_if_binary(value) for name, value in item.items()}
            for item in xseq
        ]
        predicted = tagger.tag(data)
        assert predicted == yseq
Example #19
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_tag_invalid_feature_format(model_filename, bad_seq):
    """Tagging ``bad_seq`` with an open model must raise ``ValueError``."""
    # The tagger context is entered first, then the raises context, matching
    # the nesting order of two separate ``with`` statements.
    with Tagger().open(model_filename) as tagger, pytest.raises(ValueError):
        tagger.tag(bad_seq)
Example #20
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_tag_probability(model_filename, xseq, yseq):
    """The predicted labelling must be more probable than a constant labelling."""
    with Tagger().open(model_filename) as tagger:
        best_labels = tagger.tag(xseq)
        best_prob = tagger.probability(best_labels)

        # Baseline: the first gold label repeated for every position.
        constant_labels = [yseq[0] for _ in yseq]
        constant_prob = tagger.probability(constant_labels)

        assert best_prob > constant_prob
        assert 0 < best_prob < 1
        assert 0 < constant_prob < 1
Example #21
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_dump(tmpdir, model_filename):
    """Dump an open model to text, then check a closed tagger refuses to dump."""
    dump_filename = str(tmpdir.join("dump.txt"))

    with Tagger().open(model_filename) as tagger:
        tagger.dump(dump_filename)

        with open(dump_filename, 'rb') as dump_file:
            dumped = dump_file.read().decode('utf8')
        assert 'LABELS = {' in dumped
        assert u'солнце:не светит --> rainy:' in dumped

    # it shouldn't segfault on a closed tagger
    with pytest.raises(RuntimeError):
        tagger.dump(dump_filename)
Example #22
Source File: test_tagger.py    From python-crfsuite with MIT License 5 votes vote down vote up
def test_append_strstr_dicts(tmpdir):
    """Train on dicts with string values and check the resulting attribute names."""
    trainer = Trainer()
    items = [{'foo': 'bar'}, {'baz': False}, {'foo': 'bar', 'baz': True}, {'baz': 0.2}]
    labels = ['spam', 'egg', 'spam', 'spam']
    trainer.append(items, labels)

    model_filename = str(tmpdir.join('model.crfsuite'))
    trainer.train(model_filename)

    with Tagger().open(model_filename) as tagger:
        info = tagger.info()
        attribute_names = set(info.attributes.keys())
        # The string-valued feature shows up as 'foo:bar'; 'baz' keeps its name.
        assert attribute_names == {'foo:bar', 'baz'}
        assert info.state_features[('foo:bar', 'spam')] > 0
Example #23
Source File: crf_tokenizer.py    From MicroTokenizer with MIT License 5 votes vote down vote up
def load_model(self):
        """Open the CRF model at ``self.model_file`` and load the pickled feature functions."""
        tagger = self.crf_tagger = pycrfsuite.Tagger()
        tagger.open(self.model_file)

        # The feature-function list is stored as a pickle next to the model;
        # its path is resolved from ``self.model_dir``.
        feature_path = self.get_char2feature_file(self.model_dir)
        with open(feature_path, 'rb') as stream:
            self.feature_func_list = pickle.load(stream)
Example #24
Source File: crf_loader.py    From MicroTokenizer with MIT License 5 votes vote down vote up
def from_disk(self, model_path, tokenizer_list, *args, **kwargs):
        """
        Load the CRF model and feature functions, then share them with tokenizers.

        :param model_path: location passed to ``self.get_model_file`` and
            ``self.get_char2feature_file`` to resolve the two files
        :param tokenizer_list: tokenizers that each receive the loaded tagger
            and feature-function list via ``assign_from_loader``
        """
        self.model_file = self.get_model_file(model_path)

        tagger = self.crf_tagger = pycrfsuite.Tagger()
        tagger.open(self.model_file)

        feature_path = self.get_char2feature_file(model_path)
        with open(feature_path, 'rb') as stream:
            self.feature_func_list = pickle.load(stream)

        # Every tokenizer gets the same tagger and feature-function objects.
        shared = {
            'crf_tagger': self.crf_tagger,
            'feature_func_list': self.feature_func_list,
        }
        for tokenizer in tokenizer_list:
            tokenizer.assign_from_loader(**shared)
Example #25
Source File: crf.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 5 votes vote down vote up
def tag(self, tokens):
        '''
        Tag a single sentence by delegating to ``tag_sents``.

        NB: a model file is expected to be available before calling this,
        either by
                       - training a new model with the ``train'' function, or
                       - pointing at a pre-trained model via ``set_model_file''
        :params tokens : list of tokens needed to tag.
        :type tokens : list(str)
        :return : list of tagged tokens.
        :rtype : list (tuple(str,str))
        '''
        # Wrap the sentence in a one-element batch and unwrap the result.
        tagged_batch = self.tag_sents([tokens])
        return tagged_batch[0]
Example #26
Source File: CRF.py    From indic_tagger with Apache License 2.0 5 votes vote down vote up
def load_model(self):
		"""Create ``self.tagger`` and open the model at ``self.model_path``."""
		crf = self.tagger = pycrfsuite.Tagger()
		crf.open(self.model_path)
Example #27
Source File: model.py    From underthesea with GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, model_path=None):
        """
        Open a CRF model and keep it as ``self.estimator``.

        :param model_path: path of the model file; any falsy value selects
            ``wt_crf_2018_09_13.bin`` next to this module
        """
        model_path = model_path or join(dirname(__file__), "wt_crf_2018_09_13.bin")
        tagger = pycrfsuite.Tagger()
        tagger.open(model_path)
        # Assigned only after the model opened successfully.
        self.estimator = tagger
Example #28
Source File: crf.py    From razzy-spinner with GNU General Public License v3.0 5 votes vote down vote up
def tag(self, tokens):
        '''
        Tag a single sentence by delegating to ``tag_sents``.

        NB: a model file is expected to be available before calling this,
        either by
                       - training a new model with the ``train'' function, or
                       - pointing at a pre-trained model via ``set_model_file''
        :params tokens : list of tokens needed to tag.
        :type tokens : list(str)
        :return : list of tagged tokens.
        :rtype : list (tuple(str,str))
        '''
        # Wrap the sentence in a one-element batch and unwrap the result.
        single_sentence_batch = [tokens]
        results = self.tag_sents(single_sentence_batch)
        return results[0]
Example #29
Source File: tagger.py    From reldi-tagger with Apache License 2.0 5 votes vote down vote up
def load_models(lang, dir=None):
    """Load the trie, CRF tagger and lemmatiser resources for *lang*.

    The results are stored in the module-level ``trie``, ``tagger`` and
    ``lemmatiser`` globals; nothing is returned.

    :param lang: prefix of the resource files (``<lang>.marisa``,
        ``<lang>.msd.model``, ``<lang>.lexicon.guesser``, ``<lang>.lexicon``)
    :param dir: directory holding the resource files; when ``None`` the
        module-level ``reldir`` is used instead
    """
    global trie
    global tagger
    global lemmatiser

    # Resolve the directory into a separate local. Assigning to ``reldir``
    # here would make that name function-local, so the ``dir is None`` path
    # would fail with UnboundLocalError instead of reading the module value.
    # NOTE(review): assumes ``reldir`` is defined at module level - confirm.
    model_dir = reldir if dir is None else dir

    def _unpickle(filename):
        # Close each file deterministically instead of leaking the handle.
        # NOTE: pickle can execute arbitrary code; load trusted files only.
        with open(os.path.join(model_dir, filename), 'rb') as handle:
            return pickle.load(handle)

    trie = _unpickle(lang + '.marisa')
    tagger = pycrfsuite.Tagger()
    tagger.open(os.path.join(model_dir, lang + '.msd.model'))
    lemmatiser = {'model': _unpickle(lang + '.lexicon.guesser'),
                  'lexicon': _unpickle(lang + '.lexicon')}
Example #30
Source File: spotcheck.py    From parserator with MIT License 5 votes vote down vote up
def compareTaggers(model1, model2, string_list, module_name):
    """
    Spot-check two models by printing every string they tag differently.

    For each string whose two tag sequences differ, the tokens are printed
    alongside the tags from both models; a final line reports how many
    strings differed.
    :param model1: a .crfsuite filename, opened as ``module_name + '/' + model1``
    :param model2: another .crfsuite filename, opened the same way
    :param string_list: a list of strings to be checked
    :param module_name: name of a parser module
    """
    module = __import__(module_name)

    def _open_tagger(model_file):
        # Model files are looked up under a directory named after the module.
        opened = pycrfsuite.Tagger()
        opened.open(module_name+'/'+model_file)
        return opened

    tagger1 = _open_tagger(model1)
    tagger2 = _open_tagger(model2)

    differing = 0
    rule = '-'*75

    for string in string_list:
        tokens = module.tokenize(string)
        if not tokens:
            # Nothing to compare for strings that yield no tokens.
            continue

        features = module.tokens2features(tokens)
        tags1 = tagger1.tag(features)
        tags2 = tagger2.tag(features)
        if tags1 == tags2:
            continue

        differing += 1
        print('\n')
        print("%s. %s" %(differing, string))

        print(rule)
        print_spaced('token', model1, model2)
        print(rule)
        for token, first_tag, second_tag in zip(tokens, tags1, tags2):
            print_spaced(token, first_tag, second_tag)
    print("\n\n%s of %s strings were labeled differently"%(differing, len(string_list)))