python source code of NER

#!/usr/bin/python3.7
# -*- coding: utf-8 -*-
# @Time  : 2019/8/4 1:50
# @Author: jtyoui@qq.com
from keras.models import Sequential, load_model
from keras.layers import Embedding, Bidirectional, LSTM, SpatialDropout1D, TimeDistributed, Dense
from keras_contrib.layers import CRF, crf
from keras.optimizers import Adam
from keras.utils.np_utils import np
from keras.callbacks import RemoteMonitor
from jtyoui.neuralNetwork.kerase.AnalyticalData import *
from keras.activations import relu

tag = {'O': 0, 'B-a': 1, 'I-a': 2, 'B-b': 3, 'I-b': 4, 'B-c': 5, 'I-c': 6}
dirs = r'C:\Users\Xiaoi\Desktop\datagrand\datagrand\\'
vocab_path_model = dirs + 'data/vocab.pkl'
train_vocab_path = dirs + 'data/corpus.txt'
train_path = dirs + 'data/train.txt'
test_path = dirs + 'data/test.txt'
temp_path = dirs + 'data/crf.txt'
result_path = dirs + 'data/result.txt'
model_path = dirs + 'data/model-ner.h5'
cxl_model = dirs + r'data/cxl.pkl'

# analysis_vocab(train_vocab_path, vocab_path_model)  # 训练vocab，只需要训练一次就够了
vocab = load_vocab(vocab_path_model)
length = analysis_rational_len(train_path, percent=0.93)


def load_embedding():
    with open(cxl_model, 'rb') as fp:
        cxl = pickle.load(fp)
    embedding_matrix = np.zeros((len(vocab), 300))
    for k, v in vocab.items():
        if cxl.get(k) is not None:
            embedding_matrix[v] = cxl[k]
    return embedding_matrix


def train_model():
    if cxl_model:
        embedding_matrix = load_embedding()
    else:
        embedding_matrix = {}
    train, label = vocab_train_label(train_path, vocab=vocab, tags=tag, max_chunk_length=length)
    n = np.array(label, dtype=np.float)
    labels = n.reshape((n.shape[0], n.shape[1], 1))
    model = Sequential([
        Embedding(input_dim=len(vocab), output_dim=300, mask_zero=True, input_length=length, weights=[embedding_matrix],
                  trainable=False),
        SpatialDropout1D(0.2),
        Bidirectional(layer=LSTM(units=150, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
        TimeDistributed(Dense(len(tag), activation=relu)),
    ])
    crf_ = CRF(units=len(tag), sparse_target=True)
    model.add(crf_)
    model.compile(optimizer=Adam(), loss=crf_.loss_function, metrics=[crf_.accuracy])
    model.fit(x=np.array(train), y=labels, batch_size=16, epochs=4, callbacks=[RemoteMonitor()])
    model.save(model_path)


def predict_model():
    x_test, original = vocab_test(test_path, vocab, length)
    ws = open(temp_path, mode='w', newline='\n')
    tags = dict(zip(tag.values(), tag.keys()))
    custom_objects = {'CRF': CRF, 'crf_loss': crf.crf_loss, 'crf_viterbi_accuracy': crf.crf_viterbi_accuracy}
    model = load_model(model_path, custom_objects=custom_objects)
    for question, tests in zip(original, x_test):
        raw = model.predict([[tests]])[0][-len(question):]
        result = [np.argmax(row) for row in raw]
        answer = tuple(map(lambda x: tags[x], result))
        ma = map(lambda x: x[0] + '\t' + x[1] + '\n', zip(question, answer))
        ws.writelines(ma)
        ws.write('\n')
    ws.flush()
    ws.close()


if __name__ == '__main__':
    train_model()
    predict_model()
    restore_format(crf_path=temp_path, standard_path=result_path)