# system library import numpy as np import json # user-library from utils import load_data from utils import util # third-party library from sklearn.utils import shuffle from sklearn import preprocessing from sklearn import cross_validation from sklearn.cluster import KMeans from sklearn.mixture import GMM class ClassificationBase(object): def __init__(self, isTrain, isOutlierRemoval=0, isNN=0): # indicate it is train data or not self.isTrain = isTrain self.isOutlierRemoval = isOutlierRemoval self.isNN = isNN # indicate it is neural network # route prefix self.routes = ["BCN_BUD", # route 1 "BUD_BCN", # route 2 "CRL_OTP", # route 3 "MLH_SKP", # route 4 "MMX_SKP", # route 5 "OTP_CRL", # route 6 "SKP_MLH", # route 7 "SKP_MMX"] # route 8 """ For the large data set, these datas are from the function in util """ # self.randomPrices_train = [62.404880201765373, # 64.389689785624228, # 58.365558867362232, # 53.396562847608401, # 57.555263421976903, # 56.881071560993696, # 80.610587319243578, # 81.273256281407001] # # self.randomPrices_test = [76.867777777777732, # 79.859555555555559, # 48.753285024154536, # 74.848996051889415, # 60.510405405405386, # 46.05193236714976, # 78.827685279187833, # 61.32490540540541] # # self.minPrices_train = [38.291886792452836, # 34.322716981132082, # 40.323333333333174, # 32.409354838709689, # 34.680000000000007, # 30.694444444444443, # 45.243290322580577, # 46.73057142857138] # # self.minPrices_test = [38.073333333333309, # 38.261333333333333, # 24.047971014492759, # 44.908032786885208, # 33.905806451612911, # 20.945652173913043, # 50.942655737704911, # 35.778774193548394] # # self.maxPrices_train = [126.02773584905646, # 150.12649056603769, # 119.43444444444427, # 109.10290322580629, # 125.22857142857143, # 171.72222222222223, # 165.93038709677407, # 178.90200000000002] # # self.maxPrices_test = [149.48999999999984, # 155.91466666666665, # 146.22188405797084, # 144.08836065573755, # 134.68645161290314, # 156.27173913043478, # 151.77216393442615, # 154.14329032258058] """ For the smalle data set - these datas are from the function in util """ # random price list self.randomPrices_train = [68.4391315136, 67.4260645161, 93.2808545727, 77.4751720047, 75.0340018399, 73.9964736451, 105.280932384, 97.1720369004] self.randomPrices_test = [55.4820634921, 57.8067301587, 23.152037037, 33.3727319588, 35.3032044199, 41.1180555556, 56.3433402062, 60.2546519337] # minimum price list self.minPrices_train = [44.4344444444, 38.9605925926, 68.6566666667, 49.6566666667, 48.2691891892, 47.0833333333, 68.982, 63.1279459459] self.minPrices_test = [32.370952381, 29.3775238095, 11.3788888889, 16.5284615385, 18.6184615385, 14.6111111111, 21.5127692308, 25.8050769231] # maximum price list self.maxPrices_train = [115.915925926, 126.782814815, 144.212222222, 129.656666667, 141.252972973, 149.972222222, 174.402, 160.91172973 ] self.maxPrices_test = [126.656666667, 168.95847619, 93.6011111111, 90.5669230769, 101.233846154, 198.361111111, 154.505076923, 208.020461538] # for currency change self.currency = [1, # route 1 - Euro 0.0032, # route 2 - Hungarian Forint 1, # route 3 - Euro 1, # route 4 - Euro 0.12, # route 5 - Swedish Krona 0.25, # route 6 - Romanian Leu 0.018, # route 7 - Macedonian Denar 0.018 # route 8 - Macedonian Denar ] # feature 0~7: flight number dummy variables # feature 8: departure date; feature 9: observed date state; # feature 10: minimum price; feature 11: maximum price # output: prediction(buy or wait); output_price: price # load training datasets if isOutlierRemoval: self.X_train = np.load('inputClf_GMMOutlierRemoval/X_train.npy') self.y_train = np.load('inputClf_GMMOutlierRemoval/y_train.npy') self.y_train_price = np.load('inputClf_GMMOutlierRemoval/y_train_price.npy') else: self.X_train = np.load('inputClf_small/X_train.npy') self.y_train = np.load('inputClf_small/y_train.npy') self.y_train_price = np.load('inputClf_small/y_train_price.npy') # deal with unbalanced data #self.X_train, self.y_train = self.dealingUnbalancedData(self.X_train, self.y_train) # load test datasets if isTrain: self.X_test = np.load('inputClf_small/X_train.npy') self.y_test = np.load('inputClf_small/y_train.npy') self.y_test_price = np.load('inputClf_small/y_train_price.npy') self.y_pred = np.empty(shape=(self.y_test.shape[0],1)) # choose the dates whose departureDate-queryDate gaps is larger than 20 self.y_test = self.y_test[np.where(self.X_test[:, 8]>20)[0], :] self.y_test_price = self.y_test_price[np.where(self.X_test[:, 8]>20)[0], :] self.y_pred = self.y_pred[np.where(self.X_test[:, 8]>20)[0], :] self.X_test = self.X_test[np.where(self.X_test[:, 8]>20)[0], :] """ # split train and validation set tmpMatrix = np.concatenate((self.X_test, self.y_test_price, self.y_pred), axis=1) trainMatrix, testMatrix, self.y_train, self.y_test = cross_validation.train_test_split( tmpMatrix, self.y_test, test_size=0.4, random_state=0) self.X_train = trainMatrix[:, 0:12] self.y_train_price = trainMatrix[:, 12] self.y_train_price = self.y_train_price.reshape((self.y_train_price.shape[0], 1)) #self.y_pred = trainMatrix[:, 13] self.X_test = testMatrix[:, 0:12] self.y_test_price = testMatrix[:, 12] self.y_test_price = self.y_test_price.reshape((self.y_test_price.shape[0], 1)) self.y_pred = testMatrix[:, 13] """ else: self.X_test = np.load('inputClf_small/X_test.npy') self.y_test = np.load('inputClf_small/y_test.npy') self.y_test_price = np.load('inputClf_small/y_test_price.npy') self.y_pred = np.empty(shape=(self.y_test.shape[0],1)) def priceNormalize(self): """ Different routes have different units for the price, normalize it as Euro. :return: NA """ # normalize feature 10, feature 11, feature 13 # feature 0~7: flight number dummy variables # feature 8: departure date; feature 9: observed date state; # feature 10: minimum price; feature 11: maximum price # fearure 12: prediction(buy or wait); feature 13: price evalMatrix_train = np.concatenate((self.X_train, self.y_train, self.y_train_price), axis=1) evalMatrix_test = np.concatenate((self.X_test, self.y_test, self.y_test_price), axis=1) matrixTrain = np.empty(shape=(0, evalMatrix_train.shape[1])) matrixTest = np.empty(shape=(0, evalMatrix_train.shape[1])) for i in range(len(self.routes)): evalMatrix = evalMatrix_train[np.where(evalMatrix_train[:, i]==1)[0], :] evalMatrix[:, 10] *= self.currency[i] evalMatrix[:, 11] *= self.currency[i] evalMatrix[:, 13] *= self.currency[i] matrixTrain = np.concatenate((matrixTrain, evalMatrix), axis=0) evalMatrix = evalMatrix_test[np.where(evalMatrix_test[:, i]==1)[0], :] evalMatrix[:, 10] *= self.currency[i] evalMatrix[:, 11] *= self.currency[i] evalMatrix[:, 13] *= self.currency[i] matrixTest = np.concatenate((matrixTest, evalMatrix), axis=0) self.X_train = matrixTrain[:, 0:12] self.y_train = matrixTrain[:, 12] self.y_train_price = matrixTrain[:, 13] self.X_test = matrixTest[:, 0:12] self.y_test = matrixTest[:, 12] self.y_test_price = matrixTest[:, 13] self.y_train = self.y_train.reshape((self.y_train.shape[0], 1)) self.y_train_price = self.y_train_price.reshape((self.y_train_price.shape[0], 1)) self.y_test = self.y_test.reshape((self.y_test.shape[0], 1)) self.y_test_price = self.y_test_price.reshape((self.y_test_price.shape[0], 1)) #self.X_train = np.concatenate((self.X_train, self.y_train_price), axis=1) #self.X_test = np.concatenate((self.X_test, self.y_test_price), axis=1) np.save('inputClf/X_train', self.X_train) np.save('inputClf/y_train', self.y_train) np.save('inputClf/y_train_price', self.y_train_price) np.save('inputClf/X_test', self.X_test) np.save('inputClf/y_test', self.y_test) np.save('inputClf/y_test_price', self.y_test_price) def Standardization(self): scaled = preprocessing.scale(self.X_train[:, 10:12]) self.X_train[:, 10:12] = scaled scaled = preprocessing.scale(self.X_test[:, 10:12]) self.X_test[:, 10:12] = scaled def load(self, dataset="large data set"): """ Load the data for classification :param dataset: dataset :return: X_train, y_train, X_test, y_test """ isOneOptimalState = False # Construct the input data d = 12 X_train = np.empty(shape=(0, d)) y_train = np.empty(shape=(0,1)) y_train_price = np.empty(shape=(0,1)) X_test = np.empty(shape=(0,d)) y_test = np.empty(shape=(0,1)) y_test_price = np.empty(shape=(0,1)) for filePrefix in self.routes: datas = load_data.load_data_with_prefix_and_dataset(filePrefix, dataset) for data in datas: print "Construct route {}, State {}, departureDate {}...".format(filePrefix, data["State"], data["Date"]) x_i = [] # feature 1: flight number -> dummy variables for i in range(len(self.routes)): """ !!!need to change! """ if i == self.routes.index(filePrefix): x_i.append(1) else: x_i.append(0) # feature 2: departure date interval from "20151109", because the first observed date is 20151109 departureDate = data["Date"] """ !!!maybe need to change the first observed date """ departureDateGap = util.days_between(departureDate, "20151109") x_i.append(departureDateGap) # feature 3: observed days before departure date state = data["State"] x_i.append(state) # feature 4: minimum price before the observed date minimumPreviousPrice = self.getMinimumPreviousPrice(data["Date"], state, datas) x_i.append(minimumPreviousPrice) # feature 5: maximum price before the observed date maximumPreviousPrice = self.getMaximumPreviousPrice(data["Date"], state, datas) x_i.append(maximumPreviousPrice) # output y_i = [0] specificDatas = [] specificDatas = [data2 for data2 in datas if data2["Date"]==departureDate] if isOneOptimalState: # Method 1: only 1 entry is buy optimalState = load_data.getOptimalState(specificDatas) if data["State"] == optimalState: y_i = [1] else: # Method 2: multiple entries can be buy minPrice = load_data.getMinimumPrice(specificDatas) if util.getPrice(data["MinimumPrice"]) == minPrice: y_i = [1] # keep price info y_price = [util.getPrice(data["MinimumPrice"])] if int(departureDate) < 20160115: # choose date before "20160115" as training data X_train = np.concatenate((X_train, [x_i]), axis=0) y_train = np.concatenate((y_train, [y_i]), axis=0) y_train_price = np.concatenate((y_train_price, [y_price]), axis=0) elif int(departureDate) < 20160220: # choose date before "20160220" as test data X_test = np.concatenate((X_test, [x_i]), axis=0) y_test = np.concatenate((y_test, [y_i]), axis=0) y_test_price = np.concatenate((y_test_price, [y_price]), axis=0) else: pass if isOneOptimalState: np.save('inputNN_1Buy/X_train', X_train) np.save('inputNN_1Buy/y_train', y_train) np.save('inputNN_1Buy/y_train_price', y_train_price) np.save('inputNN_1Buy/X_test', X_test) np.save('inputNN_1Buy/y_test', y_test) np.save('inputNN_1Buy/y_test_price', y_test_price) else: np.save('inputNN_NBuy/X_train', X_train) np.save('inputNN_NBuy/y_train', y_train) np.save('inputNN_NBuy/y_train_price', y_train_price) np.save('inputNN_NBuy/X_test', X_test) np.save('inputNN_NBuy/y_test', y_test) np.save('inputNN_NBuy/y_test_price', y_test_price) return X_train, y_train, X_test, y_test def getMinimumPreviousPrice(self, departureDate, state, datas): """ Get the minimum previous price, corresponding to the departure date and the observed date :param departureDate: departure date :param state: observed date :param datas: datasets :return: minimum previous price """ specificDatas = [] specificDatas = [data for data in datas if data["Date"]==departureDate] minimumPreviousPrice = util.getPrice(specificDatas[0]["MinimumPrice"]) for data in specificDatas: if util.getPrice(data["MinimumPrice"]) < minimumPreviousPrice and data["State"]>=state: minimumPreviousPrice = util.getPrice(data["MinimumPrice"]) return minimumPreviousPrice def getMaximumPreviousPrice(self, departureDate, state, datas): """ Get the maximum previous price, corresponding to the departure date and the observed date :param departureDate: departure date :param state: observed date :param datas: datasets :return: maximum previous price """ specificDatas = [] specificDatas = [data for data in datas if data["Date"]==departureDate] maximumPreviousPrice = util.getPrice(specificDatas[0]["MinimumPrice"]) for data in specificDatas: if util.getPrice(data["MinimumPrice"]) > maximumPreviousPrice and data["State"]>=state: maximumPreviousPrice = util.getPrice(data["MinimumPrice"]) return maximumPreviousPrice def dealingUnbalancedData(self): """ Dealing with unbalanced training data """ len0 = np.count_nonzero(1-self.y_train) len1 = np.count_nonzero(self.y_train) dup = int(len0/len1) dup = int(dup * 1.5) # change this value, make it more possible to predict buy. X1 = self.X_train[np.where(self.y_train==1)[0], :] y1 = self.y_train[np.where(self.y_train==1)[0], :] y2 = self.y_train_price[np.where(self.y_train==1)[0], :] X1 = np.tile(X1, (dup-1,1)) y1 = np.tile(y1, (dup-1,1)) y2 = np.tile(y2, (dup-1,1)) self.X_train = np.concatenate((self.X_train, X1), axis=0) self.y_train = np.concatenate((self.y_train, y1), axis=0) self.y_train_price = np.concatenate((self.y_train_price, y2), axis=0) # shuffle train data self.X_train, self.y_train, self.y_train_price = shuffle(self.X_train, self.y_train, self.y_train_price, random_state=42) def visualizePrediction(self, filePrefix): """ Visualize the prediction buy entries for every departure date, for each route :param filePrefix: route prefix :return: NA """ # route index flightNum = self.routes.index(filePrefix) # concatenate the buy or wait info to get the total datas y_pred = self.y_pred.reshape((self.y_pred.shape[0],1)) X_test = np.concatenate((self.X_test, self.y_test, y_pred, self.y_test_price), axis=1) # choose one route datas X_test = X_test[np.where(X_test[:, flightNum]==1)[0], :] # remove dummy variables # feature 0: departure date; feature 1: observed date state # feature 2: minimum price; feature 3: maximum price # feature 4: output(buy or wait); feature 5: prediction # feature 7: current price X_test = X_test[:, 8:15] # group by the feature: departure date departureDates_test = np.unique(X_test[:, 0]) # get the final datas, the observed data state should be from large to small(i.e. for time series) length_test = [] for departureDate in departureDates_test: indexs = np.where(X_test[:, 0]==departureDate)[0] datas = X_test[indexs, :] length_test.append(len(datas)) print departureDate print datas def visualizeTrainData(self, filePrefix): """ Visualize the train buy entries for every departure date, for each route :param filePrefix: route prefix :return: NA """ # route index flightNum = self.routes.index(filePrefix) # concatenate the buy or wait info to get the total datas y_train = self.y_train.reshape((self.y_train.shape[0],1)) y_train_price = self.y_train_price.reshape((self.y_train_price.shape[0],1)) X_train = np.concatenate((self.X_train, y_train, y_train_price), axis=1) # choose one route datas X_train = X_train[np.where(X_train[:, flightNum]==1)[0], :] # remove dummy variables # feature 0: departure date; feature 1: observed date state # feature 2: minimum price; feature 3: maximum price # feature 4: prediction(buy or wait). X_train = X_train[:, 8:14] # group by the feature: departure date departureDates_train = np.unique(X_train[:, 0]) # get the final datas, the observed data state should be from large to small(i.e. for time series) length_test = [] for departureDate in departureDates_train: indexs = np.where(X_train[:, 0]==departureDate)[0] datas = X_train[indexs, :] length_test.append(len(datas)) print departureDate print datas def evaluateOneRoute(self, filePrefix="BCN_BUD"): """ Evaluate one route for one time :param filePrefix: route :return: average price """ if self.isNN: # if it is nn, then run multiple times self.training() self.predict() #X_test, y_pred = self.predict() X_test = self.X_test #y_pred = self.y_pred #y_pred = y_pred.reshape((y_pred.shape[0], 1)) y_pred = self.y_pred.reshape((self.y_pred.shape[0], 1)) y_test_price = self.y_test_price #np.load('inputClf/y_test_price.npy') """ y_price = np.empty(shape=(0, 1)) for i in range(y_test_price.shape[0]): price = [util.getPrice(y_test_price[i, 0])] y_price = np.concatenate((y_price, [price]), axis=0) """ # feature 0~7: flight number dummy variables # feature 8: departure date; feature 9: observed date state; # feature 10: minimum price; feature 11: maximum price # fearure 12: prediction(buy or wait); feature 13: price evalMatrix = np.concatenate((X_test, y_pred, y_test_price), axis=1) # route index flightNum = self.routes.index(filePrefix) evalMatrix = evalMatrix[np.where(evalMatrix[:, flightNum]==1)[0], :] # group by the feature 8: departure date departureDates = np.unique(evalMatrix[:, 8]) departureLen = len(departureDates) latestBuyDate = 11 # define the latest buy date state totalPrice = 0 for departureDate in departureDates: state = latestBuyDate # update the state for every departure date evaluation global isFound # indicate whether some entries is predicted to be buy isFound = 0 for i in range(evalMatrix.shape[0]): # if no entry is buy, then buy the latest one if evalMatrix[i, 8] == departureDate and evalMatrix[i, 9] == latestBuyDate: latestPrice = evalMatrix[i, 13] # if many entries is buy, then buy the first one if evalMatrix[i, 8] == departureDate and evalMatrix[i, 9] >= state and evalMatrix[i, 12] == 1: isFound = 1 state = evalMatrix[i, 9] price = evalMatrix[i, 13] if isFound == 1: totalPrice += price else: totalPrice += latestPrice #print isFound avgPrice = totalPrice * 1.0 / departureLen print "One Time avg price: {}".format(avgPrice) return avgPrice def evaluateOneRouteForMultipleTimes(self, filePrefix="BCN_BUD", timesToRun=1): """ Rune the evaluation for the given route and run it multiple times(e.g. 100), to get the avarage performance :param filePrefix: route prefix :param timesToRun: the times to run the evaluation, and get the average. :return: average price """ # fit and predict self.training() self.predict() # route index flightNum = self.routes.index(filePrefix) timesToRun = 1 # if it is neural network, please change this number to 20 or more if self.isNN: timesToRun = 20 totalPrice = 0 for i in range(timesToRun): np.random.seed(i*i*i) # do not forget to set seed for the weight initialization price = self.evaluateOneRoute(filePrefix) totalPrice += price avgPrice = totalPrice * 1.0 / timesToRun """ Just use it one time - in the small dataset, I get these datas from hard code. If you need to use the large dataset, please use this version. """ # self.minPrices_train = util.getMinPriceForSpecific_train() # self.minPrices_test = util.getMinPriceForSpecific_test() # # self.randomPrices_train = util.getRandomPriceForSpecific_train() # self.randomPrices_test = util.getRandomPriceForSpecific_test() # # self.maxPrices_train = util.getMaxPriceForSpecific_train() # self.maxPrices_test = util.getMaxPriceForSpecific_test() if self.isTrain: #print "20 times avg price: {}".format(avgPrice) print "TRAIN:" print "minimumPrice: {}".format(self.minPrices_train[flightNum]) print "maximumPrice: {}".format(self.maxPrices_train[flightNum]) print "randomPrice: {}".format(self.randomPrices_train[flightNum]) print "avgPredPrice: {}".format(avgPrice) performance = (self.randomPrices_train[flightNum] - avgPrice) / self.randomPrices_train[flightNum] * 100 print "Performance: {}%".format(round(performance,2)) maxPerformance = (self.randomPrices_train[flightNum] - self.minPrices_train[flightNum]) / self.randomPrices_train[flightNum] * 100 print "Max Perfor: {}%".format(round(maxPerformance,2)) normalizedPefor = performance / maxPerformance * 100 print "Normalized perfor: {}%".format(round(normalizedPefor,2)) else: #print "20 times avg price: {}".format(avgPrice) print "TEST:" print "minimumPrice: {}".format(self.minPrices_test[flightNum]) print "maximumPrice: {}".format(self.maxPrices_test[flightNum]) print "randomPrice: {}".format(self.randomPrices_test[flightNum]) print "avgPredPrice: {}".format(avgPrice) performance = (self.randomPrices_test[flightNum] - avgPrice) / self.randomPrices_test[flightNum] * 100 print "Performance: {}%".format(round(performance,2)) maxPerformance = (self.randomPrices_test[flightNum] - self.minPrices_test[flightNum]) / self.randomPrices_test[flightNum] * 100 print "Max Perfor: {}%".format(round(maxPerformance,2)) normalizedPefor = performance / maxPerformance * 100 print "Normalized perfor: {}%".format(round(normalizedPefor,2)) #print "Minimum price: {}".format(minimumPrice) #print "Maximum price: {}".format(maximumPrice) #print "Random price: {}".format(randomPrice) return (performance, normalizedPefor) def evaluateAllRroutes(self): """ Evaluate all the routes, print the performance for every route and the average performance for all the routes. """ performance = 0 normalizedPerformance = 0 normPerforms = [] for i in range(8): print "Route: {}".format(i) [perfor, normaPerfor] = self.evaluateOneRouteForMultipleTimes(self.routes[i]) normPerforms.append(normaPerfor) performance += perfor normalizedPerformance += normaPerfor performance = round(performance/8, 2) normalizedPerformance = round(normalizedPerformance/8, 2) if self.isTrain: print "\nTRAIN:" else: print "\nTEST:" print "Average Performance: {}%".format(performance) print "Average Normalized Performance: {}%".format(normalizedPerformance) print "Normalized Performance Variance: {}".format(np.var(normPerforms)) """ Here, the two function is not in the class, use it separately to get the input for the outlier removal. """ def kmeansRemovingOutlierForClassifier(): """ use k-means to do outlier removal :return: NA """ # load data X_train = np.load('inputClf_small/X_train.npy') y_train = np.load('inputClf_small/y_train.npy') y_train_price = np.load('inputClf_small/y_train_price.npy') # cluster initializing X_train1 = X_train[np.where(y_train==0)[0], :] X_train2 = X_train[np.where(y_train==1)[0], :] cluster1 = KMeans(init='random', n_clusters=1, random_state=0).fit(X_train1) cluster1 = cluster1.cluster_centers_ cluster2 = KMeans(init='random', n_clusters=1, random_state=0).fit(X_train2) cluster2 = cluster2.cluster_centers_ clusters = np.concatenate((cluster1, cluster2), axis=0) y_pred = KMeans(init='random', n_clusters=2, random_state=2).fit_predict(X_train) y_pred = y_pred.reshape((y_pred.shape[0], 1)) y_pred = y_pred tmp = np.concatenate((y_train, y_pred), axis=1) sam = y_train == y_pred print "# total: {}".format(y_train.shape[0]) print "# datas left: {}".format(np.sum(sam)) # Keep 63.62% data. print "Keep {}% data.".format(round(np.sum(sam)*100.0/y_train.shape[0], 2)) print tmp[0:22, :] print np.where(y_train==y_pred)[0] # keep the data which are not outliers X_train = X_train[np.where(y_train==y_pred)[0], :] y_train_price = y_train_price[np.where(y_train==y_pred)[0], :] y_train = y_train[np.where(y_train==y_pred)[0], :] np.save('inputClf_KMeansOutlierRemoval/X_train', X_train) np.save('inputClf_KMeansOutlierRemoval/y_train', y_train) np.save('inputClf_KMeansOutlierRemoval/y_train_price', y_train_price) def gmmRemovingOutlierForClassifier(): """ use GMM model to remove outlier :return: NA """ # load data X_train = np.load('inputClf_small/X_train.npy') y_train = np.load('inputClf_small/y_train.npy') y_train_price = np.load('inputClf_small/y_train_price.npy') # classifier initialize classifier = GMM(n_components=2,covariance_type='full', init_params='wmc', n_iter=20) # cluster initializing X_train1 = X_train[np.where(y_train==0)[0], :] X_train2 = X_train[np.where(y_train==1)[0], :] cluster1 = KMeans(init='random', n_clusters=1, random_state=0).fit(X_train1) cluster1 = cluster1.cluster_centers_ cluster2 = KMeans(init='random', n_clusters=1, random_state=0).fit(X_train2) cluster2 = cluster2.cluster_centers_ clusters = np.concatenate((cluster1, cluster2), axis=0) classifier.means_ = clusters # Train the other parameters using the EM algorithm. classifier.fit(X_train) # predict y_train_pred = classifier.predict(X_train) train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100 print "Keep {}% data.".format(train_accuracy) # keep the data which are not outliers y_train_pred = y_train_pred.reshape((y_train_pred.shape[0], 1)) X_train = X_train[np.where(y_train==y_train_pred)[0], :] y_train_price = y_train_price[np.where(y_train==y_train_pred)[0], :] y_train = y_train[np.where(y_train==y_train_pred)[0], :] np.save('inputClf_GMMOutlierRemoval/X_train', X_train) np.save('inputClf_GMMOutlierRemoval/y_train', y_train) np.save('inputClf_GMMOutlierRemoval/y_train_price', y_train_price) if __name__ == "__main__": gmmRemovingOutlierForClassifier() pass