import os import re import numpy as np import scipy.io import theano import theano.tensor as T import codecs import cPickle from utils import shared, set_values, get_name from nn import HiddenLayer, EmbeddingLayer, DropoutLayer, forward from nn import LSTM; #from nn import LSTM_normal as LSTM; from nn import AttentionLayer; from optimization import Optimization def loadPreEmbFeatures(fName,feature_to_id,weights,lower=False): #{{{ def f(x): return x.lower() if lower else x #to lower feature_to_id_=feature_to_id; if lower: feature_to_id_lower={}; for elem in feature_to_id.items(): feature_to_id_lower[elem[0].lower()]=elem[1]; feature_to_id_=feature_to_id_lower; feature_dim=weights.shape[1]; invalid_count=0; valid_count=0; for line in codecs.open(fName,'r','utf-8'): line=line.rstrip().split(); if len(line) == feature_dim+1 and line[0] in feature_to_id_: weights[feature_to_id_[line[0]]]=np.array( [float(x) for x in line[1:]] ).astype(theano.config.floatX) valid_count+=1; else: invalid_count+=1; print "when loading %s ,%d Invalid line,%d valid line" %(fName,invalid_count,valid_count); #}}} class Model(object): """ Network architecture. """ def __init__(self, parameters=None, models_path=None, model_path=None,Training=False): #{{{ """ Initialize the model. We either provide the parameters and a path where we store the models, or the location of a trained model. """ if Training: #{{{ assert parameters and models_path # Create a name based on the parameters self.parameters = parameters self.name = get_name(parameters) # Model location if model_path is None: model_path = os.path.join(models_path, self.name) self.model_path = model_path self.parameters_path = os.path.join(model_path, 'parameters.pkl') self.mappings_path = os.path.join(model_path, 'mappings.pkl') # Create directory for the model if it does not exist if not os.path.exists(self.model_path): os.makedirs(self.model_path) # Save the parameters to disk with open(self.parameters_path, 'wb') as f: cPickle.dump(parameters, f) #}}} else: #{{{ # Model location self.model_path = model_path self.parameters_path = os.path.join(model_path, 'parameters.pkl') self.mappings_path = os.path.join(model_path, 'mappings.pkl') # Create directory for the model if it does not exist if not os.path.exists(self.model_path): os.makedirs(self.model_path) # Save the parameters to disk with open(self.parameters_path, 'rb') as f: self.parameters=cPickle.load(f); self.reload_mappings(); self.components = {} #}}} #}}} def save_mappings(self, id_to_word, id_to_char, id_to_tag): #{{{ """ We need to save the mappings if we want to use the model later. """ self.id_to_word = id_to_word self.id_to_char = id_to_char self.id_to_tag = id_to_tag with open(self.mappings_path, 'wb') as f: mappings = { 'id_to_word': self.id_to_word, 'id_to_char': self.id_to_char, 'id_to_tag': self.id_to_tag, } cPickle.dump(mappings, f) #}}} def reload_mappings(self): #{{{ """ Load mappings from disk. """ with open(self.mappings_path, 'rb') as f: mappings = cPickle.load(f) self.id_to_word = mappings['id_to_word'] self.id_to_char = mappings['id_to_char'] self.id_to_tag = mappings['id_to_tag'] #}}} def add_component(self, param): """ Add a new parameter to the network. """ if param.name in self.components: raise Exception('The network already has a parameter "%s"!' % param.name) self.components[param.name] = param def modelScore(self,tag_ids,scores,s_len): #{{{ """ ATTENTATION THIS FUNCTION IS SYMBOL PROGRAMMING this function is to return the score of our model at a fixed sentence label @param: scores: the scores matrix ,the output of our model tag: a numpy array, which represent one sentence label sent_lens: a scalar number, the length of sentence. because our sentence label will be expand to max sentence length, so we will use this to get the original sentence label. @return: a scalar number ,the score; """ #{{{ n_tags=self.output_dim; transitions=self.transitions; #score from tags_scores real_path_score = scores[T.arange(s_len), tag_ids].sum() # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[ padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1] ].sum() #to prevent T.exp(real_path_score) to be inf #return real_path_score; return real_path_score/s_len; #}}} #}}} def save(self): #{{{ """ Write components values to disk. """ for name, param in self.components.items(): param_path = os.path.join(self.model_path, "%s.mat" % name) if hasattr(param, 'params'): param_values = {p.name: p.get_value() for p in param.params} else: param_values = {name: param.get_value()} scipy.io.savemat(param_path, param_values) #}}} def reload(self,features=None): #{{{ """ Load components values from disk. """ featureLayerNameMap=['pos_layer','lemma_layer', 'chunk_layer','dic_layer']; for name, param in self.components.items(): #when feature is use to attended and not lstm-input, #we will not reload the param if features is not None and name in featureLayerNameMap: featuresName=name[:name.find('_')]; if features[featuresName]['attended']==1 and \ features[featuresName]['lstm-input']==0: continue; param_path = os.path.join(self.model_path, "%s.mat" % name) param_values = scipy.io.loadmat(param_path) if hasattr(param, 'params'): for p in param.params: set_values(p.name, p, param_values[p.name]) else: set_values(name, param, param_values[name]) #}}} def build4(self,parameters): #{{{ """ Build the network. """ #some parameters dropout=parameters['dropout'] ; char_dim=parameters['char_dim']; char_lstm_dim=parameters['char_lstm_dim']; char_bidirect=parameters['char_bidirect']; word_dim=parameters['word_dim']; word_lstm_dim=parameters['word_lstm_dim']; word_bidirect=parameters['word_bidirect']; lr_method=parameters['lr_method']; pre_emb=parameters['pre_emb']; crf=parameters['crf']; cap_dim=parameters['cap_dim']; training=parameters['training']; features=parameters['features']; useAttend=parameters['useAttend']; if useAttend: reloadParam=parameters['loading']; else: reloadParam=None; if reloadParam is not None: reloadPath=parameters['loading_path']; sentencesLevelLoss=parameters['sentencesLevelLoss']; # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) self.output_dim = len(self.id_to_tag); self.transitions = shared((self.output_dim+ 1, self.output_dim ), 'transitions') # Number of capitalization features if cap_dim: n_cap = 4 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') wordTrue_ids=T.ivector(name='wordTrue_ids'); char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') docLen=T.ivector(name='docLen'); tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') #some features if features is not None and features['lemma']['isUsed']: lemma_ids=T.ivector(name='lemma_ids'); if features is not None and features['pos']['isUsed']: pos_ids=T.ivector(name='pos_ids'); if features is not None and features['chunk']['isUsed']: chunk_ids=T.ivector(name='chunk_ids'); if features is not None and features['dic']['isUsed']: dic_ids=T.ivector(name='dic_ids'); # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # Word inputs #{{{ if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) wordTrue_input=word_layer.link(wordTrue_ids); inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]] ).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[ re.sub('\d', '0', word.lower()) ] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print ('%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. * (c_found + c_lower + c_zeros) / n_words ) print ('%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % ( c_found, c_lower, c_zeros )#}}} # Chars inputs #{{{ if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] char_output=T.concatenate([char_for_output,char_rev_output],axis=-1); inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim #}}} # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) #add feature #{{{ if features is not None and features['lemma']['isUsed']: lemma_layer=EmbeddingLayer(features['lemma']['num'], features['lemma']['dim'], name='lemma_layer'); if features['lemma']['pre_emb'] is not "": new_weights=lemma_layer.embeddings.get_value(); loadPreEmbFeatures(features['lemma']['pre_emb'], features['feature_to_id_map']['lemma'], new_weights, lower=True); lemma_layer.embeddings.set_value(new_weights); lemma_output=lemma_layer.link(lemma_ids); if features['lemma']['lstm-input']: input_dim+=features['lemma']['dim']; inputs.append(lemma_output); if features is not None and features['pos']['isUsed']: pos_layer=EmbeddingLayer(features['pos']['num'], features['pos']['dim'], name='pos_layer'); if features['pos']['pre_emb'] is not "": new_weights=pos_layer.embeddings.get_value(); loadPreEmbFeatures(features['pos']['pre_emb'], features['feature_to_id_map']['pos'], new_weights); pos_layer.embeddings.set_value(new_weights); pos_output=pos_layer.link(pos_ids); if features['pos']['lstm-input']: input_dim+=features['pos']['dim']; inputs.append(pos_output); if features is not None and features['chunk']['isUsed']: chunk_layer=EmbeddingLayer(features['chunk']['num'], features['chunk']['dim'], name='chunk_layer'); chunk_output=chunk_layer.link(chunk_ids); if features['chunk']['lstm-input']: input_dim+=features['chunk']['dim']; inputs.append(chunk_output) if features is not None and features['dic']['isUsed']: dic_layer=EmbeddingLayer(features['dic']['num'], features['dic']['dim'], name='dic_layer'); dic_output=dic_layer.link(dic_ids); if features['dic']['lstm-input']: input_dim+=features['dic']['dim']; inputs.append(dic_output); #}}} # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train,input_test); # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') if sentencesLevelLoss: def sentLSTM(i,output,input,lenVec): #{{{ Len=lenVec[i]; accLen=lenVec[:i].sum(); currentInput=input[accLen:accLen+Len]; word_lstm_for.link(currentInput); word_lstm_rev.link(currentInput[::-1,:]); wordForOutput=word_lstm_for.h; wordRevOutput=word_lstm_rev.h[::-1,:]; finalOutput=T.concatenate( [wordForOutput,wordRevOutput],axis=-1 ) output=T.set_subtensor(output[accLen:accLen+Len], finalOutput); return output; #}}} result,update=theano.scan(fn=sentLSTM, outputs_info=T.zeros((inputs.shape[0],word_lstm_dim*2),dtype='float32'), sequences=[T.arange(docLen.shape[0])], non_sequences=[inputs,docLen]); word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_for_c=word_lstm_for.c; word_rev_output = word_lstm_rev.h[::-1, :] word_rev_c=word_lstm_rev.c[::-1,:]; final_c=T.concatenate( [word_for_c,word_rev_c], axis=-1 ) final_output=result[-1] else : word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_for_c=word_lstm_for.c; word_rev_output = word_lstm_rev.h[::-1, :] word_rev_c=word_lstm_rev.c[::-1,:]; final_output = T.concatenate( [word_for_output, word_rev_output], axis=-1 ) final_c=T.concatenate( [word_for_c,word_rev_c], axis=-1 ) if useAttend: #attention layer attended=[]; attendedDim=0; if features is not None and features['word']['attended']: attended.append(wordTrue_input); attendedDim+=word_dim; if features is not None and features['char']['attended']: attended.append(char_output); attendedDim+=char_lstm_dim*2; if features is not None and features['lemma']['attended']: attended.append(lemma_output); attendedDim+=features['lemma']['dim']; if features is not None and features['pos']['attended']: attended.append(pos_output); attendedDim+=features['pos']['dim']; if features is not None and features['chunk']['attended']: attended.append(chunk_output); attendedDim+=features['chunk']['dim']; if features is not None and features['dic']['attended']: attended.append(dic_output); attendedDim+=features['dic']['dim']; attention_layer=AttentionLayer(attended_dim=attendedDim, state_dim=attendedDim, #attention_layer=AttentionLayer(attended_dim=word_lstm_dim*2, # state_dim=word_lstm_dim*2, source_dim=word_lstm_dim*2, scoreFunName=parameters['attenScoreFun'], name='attention_layer'); if len(attended)>1: attendedInput=T.concatenate(attended,axis=-1); else: attendedInput=attended[0]; final_output=attention_layer.link(attendedInput,attendedInput,final_output); #using lstm_state to compute attention #final_output=attention_layer.link(final_output,final_c,final_output); self.energy=attention_layer.energy; else: final_output=final_output; tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: if sentencesLevelLoss: #calcuate loss according to sentence instead of docLen def sentLoss(i,scores,trueIds,transitions,lenVec): #{{{ Len=lenVec[i]; accLen=lenVec[:i].sum(); currentTagsScores=scores[accLen:accLen+Len]; currentIds=trueIds[accLen:accLen+Len]; real_path_score = currentTagsScores[T.arange(Len), currentIds].sum() # Score from transitions padded_tags_ids = T.concatenate([[n_tags],currentIds], axis=0) real_path_score += transitions[ padded_tags_ids[T.arange(Len )], padded_tags_ids[T.arange(Len ) + 1] ].sum() all_paths_scores = forward(currentTagsScores,transitions) cost = - (real_path_score - all_paths_scores) return cost; #}}} result,update=theano.scan(fn=sentLoss, outputs_info=None, sequences=[T.arange(docLen.shape[0])], non_sequences=[tags_scores,tag_ids,self.transitions,docLen]) cost=result.sum(); else: real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0) real_path_score += self.transitions[ padded_tags_ids[T.arange(s_len )], padded_tags_ids[T.arange(s_len ) + 1] ].sum() all_paths_scores = forward(tags_scores, self.transitions) cost = - (real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(self.transitions) params.append(self.transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) #add feature layer if features is not None and features['lemma']['isUsed']: self.add_component(lemma_layer); params.extend(lemma_layer.params); if features is not None and features['pos']['isUsed']: self.add_component(pos_layer); params.extend(pos_layer.params); if features is not None and features['chunk']['isUsed']: self.add_component(chunk_layer); params.extend(chunk_layer.params); if features is not None and features['dic']['isUsed']: self.add_component(dic_layer); params.extend(dic_layer.params); if useAttend and reloadParam: #reload pre-train params model_path=self.model_path; self.model_path=reloadPath; print "loading:",self.model_path; self.reload(features); self.model_path=model_path; if useAttend: #add attention_layer self.add_component(attention_layer); params.extend(attention_layer.params); # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) if useAttend: eval_inputs.append(wordTrue_ids); if sentencesLevelLoss: eval_inputs.append(docLen); #add feature input if features is not None and features['lemma']['isUsed']: eval_inputs.append(lemma_ids); if features is not None and features['pos']['isUsed']: eval_inputs.append(pos_ids); if features is not None and features['chunk']['isUsed']: eval_inputs.append(chunk_ids); if features is not None and features['dic']['isUsed']: eval_inputs.append(dic_ids); train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: #constraints if useAttend: self.constraints=attention_layer.constraints; else: self.constraints={}; from keras import optimizers ; self.optimizer=optimizers.SGD(lr=0.001,momentum=0.9, decay=0.,nesterov=True,clipvalue=5); self.optimizer=optimizers.RMSprop(); #self.optimizer=SGD(lr=lr_method_parameters['lr'],clipvalue=5,gradient_noise=0.01) updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params,constraints=self.constraints, **lr_method_parameters) #updates = self.optimizer.get_updates(params,self.constraints,cost); f_train_outputs=[cost]; if useAttend: f_train_outputs.append(self.energy); f_train = theano.function( inputs=train_inputs, outputs=f_train_outputs, updates=updates, on_unused_input='ignore', givens=({is_train: np.cast['int32'](1)} if dropout else {}) ) f_test = theano.function( inputs=train_inputs, outputs=cost, on_unused_input='ignore', givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) self.f_test=f_test; else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function( inputs=eval_inputs, outputs=tags_scores, givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) else: if sentencesLevelLoss: def sentVitebe(i,predictTag,scores,transitions,lenVec): #{{{ Len=lenVec[i]; accLen=lenVec[:i].sum(); currentTagsScores=scores[accLen:accLen+Len]; currentPredictIds=forward(currentTagsScores, transitions,viterbi=True, return_alpha=False, return_best_sequence=True) ; predictTag=T.set_subtensor(predictTag[accLen:accLen+Len],currentPredictIds); return predictTag; #}}} predictTag,update=theano.scan(fn=sentVitebe, outputs_info=T.zeros((tags_scores.shape[0],),dtype='int32'), sequences=[T.arange(docLen.shape[0])], non_sequences=[tags_scores,self.transitions,docLen]); predictTag=predictTag[-1]; else: predictTag=forward(tags_scores, self.transitions, viterbi=True,return_alpha=False, return_best_sequence=True) f_eval = theano.function( inputs=eval_inputs, outputs=predictTag, on_unused_input='ignore', givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) #f_AttenVisual=theano.function( # inputs=eval_inputs, # outputs=[predictTag,self.energy], # on_unused_input='ignore', # givens=({is_train: np.cast['int32'](0)} if dropout else {}) # ) #self.f_AttenVisual=f_AttenVisual; return f_train, f_eval; #}}} def build(self,parameters): #{{{ """ Build the network. """ #some parameters dropout=parameters['dropout'] ; char_dim=parameters['char_dim']; char_lstm_dim=parameters['char_lstm_dim']; char_bidirect=parameters['char_bidirect']; word_dim=parameters['word_dim']; word_lstm_dim=parameters['word_lstm_dim']; word_bidirect=parameters['word_bidirect']; lr_method=parameters['lr_method']; pre_emb=parameters['pre_emb']; crf=parameters['crf']; cap_dim=parameters['cap_dim']; training=parameters['training']; features=parameters['features']; # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) self.output_dim = len(self.id_to_tag); self.transitions = shared((self.output_dim+ 1, self.output_dim ), 'transitions') # Number of capitalization features if cap_dim: n_cap = 4 if features is not None and features['lemma']['isUsed']: lemma_ids=T.ivector(name='lemma_ids'); if features is not None and features['pos']['isUsed']: pos_ids=T.ivector(name='pos_ids'); if features is not None and features['chunk']['isUsed']: chunk_ids=T.ivector(name='chunk_ids'); if features is not None and features['NER']['isUsed']: dic_ids=T.ivector(name='dic_ids'); # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # Word inputs #{{{ if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) #for attention inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]] ).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[ re.sub('\d', '0', word.lower()) ] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print ('%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. * (c_found + c_lower + c_zeros) / n_words ) print ('%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % ( c_found, c_lower, c_zeros )#}}} # Chars inputs #{{{ if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim #}}} # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate( [word_for_output, word_rev_output], axis=1 ) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: #all_paths_scores = forward(observations, self.transitions) #cost = - (self.modelScore(tag_ids,tags_scores,s_len) - all_paths_scores) #real_path_score=self.modelScore(tag_ids,tags_scores,tag_ids.shape[0]) ; #error=real_path_score+self.noiseLoss(tags_scores,tag_ids,0.5); #cost=-error; #cost=self.likehoodLoss(tags_scores,tag_ids,observations,2) real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0) real_path_score += self.transitions[ padded_tags_ids[T.arange(s_len )], padded_tags_ids[T.arange(s_len ) + 1] ].sum() all_paths_scores = forward(tags_scores, self.transitions) cost = - (real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(self.transitions) params.append(self.transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) train_inputs = eval_inputs + [tag_ids] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: import optimizers ; self.optimizer=optimizers.RMSprop(lr=0.001); updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) self.constraints={}; #updates = self.optimizer.get_updates(params,self.constraints,cost); f_train = theano.function( inputs=train_inputs, outputs=cost, updates=updates, givens=({is_train: np.cast['int32'](1)} if dropout else {}) ) #for debug #f_Debug = theano.function( # inputs=train_inputs, # outputs=cost, # updates=self.update, # givens=({is_train: np.cast['int32'](1)} if dropout else {}) #) #debug end else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function( inputs=eval_inputs, outputs=tags_scores, givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) else: f_eval = theano.function( inputs=eval_inputs, outputs=forward(tags_scores, self.transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) return f_train, f_eval #}}}