#!/usr/bin/env python
# -*- coding: utf-8 -*-

## Project: Simple4All - June 2013 - www.simple4all.org
## Contact: Oliver Watts - owatts@staffmail.ed.ac.uk
## Contact: Antti Suni - Antti.Suni@helsinki.fi
#
# The processors in this module integrate machine learning models
# implemented in scikit-learn (http://scikit-learn.org/stable)
#

import os
import sys
import numpy as np
import pickle

from sklearn import tree
from sklearn.feature_extraction import DictVectorizer

from UtteranceProcessor import SUtteranceProcessor
import logging

import default.const as c


class SKLDecisionTree(SUtteranceProcessor):

    def __init__(self, processor_name='decision_tree', target_nodes='', output_attribute='',
                 contexts=[], min_samples_leaf=10):

        self.processor_name = processor_name
        self.target_nodes = target_nodes
        self.min_samples_leaf = min_samples_leaf  ## TODO: set other tree-building defaults here?
        self.model = None

        ## For now, use the context list given inside the recipe:
        if contexts == []:
            sys.exit('config for SKLDecisionTree should contain a list of contexts')
        self.context_list = contexts

        assert output_attribute != ''
        self.output_attribute = output_attribute

        ## Check that the response exists in the context list, and find its index:
        self.feature_names = [name for (name, xpath) in self.context_list]
        assert 'response' in self.feature_names
        response_index = self.feature_names.index('response')

        ## Ensure the response is at the *start* of the context list:
        response = self.context_list[response_index]
        del self.context_list[response_index]
        self.context_list.insert(0, response)

        super(SKLDecisionTree, self).__init__()

    def verify(self, resources):
        super(SKLDecisionTree, self).verify(resources)
        self.model_file = os.path.join(self.get_location(), 'model.pkl')  ## TODO: standard filename in const.py?
        if os.path.isfile(self.model_file):
            ## If the model file exists, count the processor as trained -- load
            ## everything from the processor directory:
            f = open(self.model_file, 'rb')
            [self.x_vectoriser, self.y_vectoriser, self.model] = pickle.load(f)
            f.close()

    def process_utterance(self, utt):
        assert self.model, 'Cannot apply processor %s until its model is trained' % (self.processor_name)
        for node in utt.xpath(self.target_nodes):
            input_features = dict(node.get_context_vector(self.context_list))
            input_features = self.x_vectoriser.transform(input_features).toarray()
            ## No need to remove the response -- it will be ignored if not in x_vectoriser.
            decision = self.model.predict(input_features)

            ### Skip this -- assume the response is numeric:
            # decision = self.y_vectoriser.inverse_transform(decision)
            '''
            ## PROBLEM: 0 values are not put in the inverse-transformed dict:
            >>> print yv.inverse_transform(yv.transform([{'response': 3}]))
            [{'response': 3.0}]
            >>> print yv.inverse_transform(yv.transform([{'response': 0.0}]))
            [{}]
            '''
            decision = decision[0]  ## prediction is a list of 1 int

            ## Prosody labeling is numeric but quantized for HTS, so 2.0 != 2 for
            ## contextual features -- cast to int where the value allows it.
            ## Does this break other modules?
            try:
                if decision == int(decision):
                    decision = int(decision)
            except (ValueError, TypeError):
                pass
            node.set(self.output_attribute, unicode(decision))  ## TODO: where is the best place to convert to unicode?

    def do_training(self, speech_corpus, text_corpus):

        ## If a model was already loaded by verify(), the processor counts as trained:
        if self.model:
            return

        ## 1) Get the data:
        ## [Added dump_features method to Utterance class -- use that:]
        x_data = []
        y_data = []
        for utterance in speech_corpus:
            utt_feats = utterance.dump_features(self.target_nodes,
                                                self.context_list, return_dict=True)
            for example in utt_feats:
                assert 'response' in example, example
                y_data.append({'response': example['response']})
                del example['response']
                x_data.append(example)

        ## Handle categorical features (strings) but keep numerical ones as they are:
        x_vectoriser = DictVectorizer()
        x_data = x_vectoriser.fit_transform(x_data).toarray()

        y_vectoriser = DictVectorizer()
        y_data = y_vectoriser.fit_transform(y_data).toarray()

        if False:  ## debug
            print x_data
            print y_data

        ## 2) Train the classifier:
        model = tree.DecisionTreeClassifier(min_samples_leaf=self.min_samples_leaf)
        model.fit(x_data, y_data)

        print '\n Trained classifier: '
        print model
        print '\n Trained x vectoriser:'
        print x_vectoriser
        print 'Feature names:'
        print x_vectoriser.get_feature_names()
        print '\n Trained y vectoriser:'
        print y_vectoriser
        print 'Feature names:'
        print y_vectoriser.get_feature_names()

        ## 3) Save the classifier by pickling:
        output = open(self.model_file, 'wb')
        pickle.dump([x_vectoriser, y_vectoriser, model], output)
        output.close()

        ## Write an ASCII (graphviz .dot) tree representation, which can be plotted:
        tree.export_graphviz(model, out_file=self.model_file + '.dot',
                             feature_names=x_vectoriser.get_feature_names())

        ## Reload -- set self.model etc.:
        self.verify(self.voice_resources)


class SKLDecisionTreePausePredictor(SKLDecisionTree):

    ## TODO: revise this -- all hardcoded!
    def process_utterance(self, utt):

        ## Add predictions at the token level:
        super(SKLDecisionTreePausePredictor, self).process_utterance(utt)

        ## Assume that if there is a waveform attached we are training; otherwise it is run time:
        is_train_time = ('waveform' in utt.attrib)

        ## Act on the predictions by adding a silence symbol:
        for segment in utt.xpath('//segment'):
            ## At run time, always add silence at the end of the sentence.
            ## (xpath on the attribute axis returns a list, so test membership:)
            token_text = segment.xpath('ancestor::token/attribute::text')
            end_of_sentence = ('_END_' in token_text)
            if (not is_train_time) and end_of_sentence:
                segment.attrib['pronunciation'] = 'sil'
            elif segment.get('pronunciation') in [c.POSS_PAUSE, c.PROB_PAUSE]:
                silence_predicted = '0'
                for ancestor in segment.iterancestors():
                    if ancestor.has_attribute(self.output_attribute):
                        silence_predicted = ancestor.get(self.output_attribute)
                if silence_predicted == '1':
                    segment.attrib['pronunciation'] = 'sil'
                else:
                    segment.getparent().remove(segment)  ## remove the segment altogether
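

## ---------------------------------------------------------------------------
## Illustrative sketch only, not part of the voice-building pipeline: a
## minimal, self-contained demonstration of the scikit-learn machinery used
## above. DictVectorizer one-hot encodes string-valued features ('pos=NN',
## 'pos=VB', ...) while passing numeric ones ('syllables') through unchanged,
## and the resulting arrays are fed to a DecisionTreeClassifier, mirroring
## do_training() above. The feature dicts below are made up for illustration;
## real contexts come from Utterance.dump_features().
if __name__ == '__main__':

    x_dicts = [{'pos': 'NN', 'syllables': 2},
               {'pos': 'VB', 'syllables': 1},
               {'pos': 'NN', 'syllables': 3}]
    y_dicts = [{'response': 1}, {'response': 0}, {'response': 1}]

    xv = DictVectorizer()
    X = xv.fit_transform(x_dicts).toarray()
    print xv.get_feature_names()  ## ['pos=NN', 'pos=VB', 'syllables']

    yv = DictVectorizer()
    Y = yv.fit_transform(y_dicts).toarray()

    ## min_samples_leaf=1 just to let this toy data set grow a tree:
    clf = tree.DecisionTreeClassifier(min_samples_leaf=1)
    clf.fit(X, Y)

    ## Transform a single feature dict and predict, as process_utterance() does:
    print clf.predict(xv.transform({'pos': 'NN', 'syllables': 2}).toarray())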