#!/usr/bin/env python
# This part uses the SVM (among other classifiers) to train on the datasets.
# In sum, we build a feature vector for each app which represents its behavioral and structural properties.
import sys
import os
import matplotlib.pyplot as plt
import numpy as np
import signal
import sklearn
from sklearn import svm, datasets
# NOTE: this script targets older scikit-learn releases (pre-0.20), where
# sklearn.cross_validation and sklearn.grid_search are still available.
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import *
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile, f_classif
from AnalysisCore import *

#print(__doc__)


# this part is for reading the benign parts
# Each row of the input file holds one app: feature values followed by the class label in the last column.
def _read_lists(filename):
    data = np.loadtxt(filename)
    x_len, y_len = data.shape
    #print data.shape
    X = data[:, :(y_len - 1)]
    y = data[:, y_len - 1]
    return X, y


def _read_lists_vectors(filename):
    data = np.loadtxt(filename)
    x_len, y_len = data.shape
    y = [0] * x_len
    return data, y


# this function is used to write a list into a file
def write_list_into_files(list_n, f):
    for i in range(0, len(list_n), 1):
        f.write("%f " % list_n[i])
        f.write("\n")


def _dataset_sample():
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    return X, y


################## this is to find the best feature selection ###############
def find_best_feature_selections(X, y):
    # select the best features using different techniques
    X_new = SelectKBest(chi2, k=80).fit_transform(X, y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X, y)
    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X, y)  # this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X, y)
    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X, y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X, y)
    print (X_new.shape)
    #selection_parameters_for_classfier(X_new,y)
    #print (y.shape)
    train_and_test(X_new, y)
    train_and_test(X_new1, y)
    train_and_test(X_new2, y)
    train_and_test(X_new22, y)
    train_and_test(X_new3, y)
    train_and_test(X_new4, y)

#X,y = _dataset_sample()
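# --- Hedged sketch (not part of the original pipeline) -----------------------
# find_best_feature_selections() above compares several SelectKBest /
# SelectPercentile settings. A minimal sketch, assuming the same k=80 /
# f_classif choice used elsewhere in this script, of recovering which original
# columns the selector keeps (features_representation_difference() below
# rebuilds this mapping by hand from get_support(indices=False)):
def _selected_feature_indices_sketch(X, y, k=80):
    selector = SelectKBest(f_classif, k=k).fit(X, y)
    # get_support(indices=True) returns the column indices that survive selection
    return selector.get_support(indices=True)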
################################ PARAMETER SELECTION ################################
#TODO: some problem happens when using the parameter max_leaf_nodes in DecisionTree and RandomForest
"""
Results for recommendation: parameter selections that make the one-subgraph setting work better.

KNN: (algorithm='auto', leaf_size=30, metric='minkowski', n_neighbors=5, p=2, weights='uniform')  +1

Naive Bayes: nbg = GaussianNB(), nbm = MultinomialNB(), nbb = BernoulliNB() performs best  -1

Decision tree: estimator=DecisionTreeClassifier(criterion='gini', max_depth=None, max_features=None,
    max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, random_state=None, splitter='best')  -1

Random forest: estimator=RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
    max_features='auto', max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10,
    n_jobs=1, oob_score=False, random_state=None, verbose=0)  +1

SVM: estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)  +1
"""


# We put each classifier's parameter candidates into a dict and run a grid search
# to find the best parameters.
def selection_parameters_for_classfier(X, y):
    # sklearn.grid_search is the pre-0.18 location of GridSearchCV
    from sklearn import grid_search
    #knn
    #paras = {'n_neighbors': [1, 10], 'weights': ['uniform', 'distance'],
    #         'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'leaf_size': [20, 50]}
    #knn = KNeighborsClassifier()
    #naive_bayes
    #nbg = GaussianNB()
    #nbm = MultinomialNB()
    #nbb = BernoulliNB()
    #decision tree
    #paras = {'criterion': ['gini', 'entropy'], 'splitter': ['random', 'best'],
    #         'max_features': [None, 'auto', 'sqrt', 'log2'], 'min_samples_split': [1, 10]}
    #dtree = DecisionTreeClassifier()
    #random forest
    #rforest = RandomForestClassifier()
    #paras = {'n_estimators': [2, 15], 'criterion': ['gini', 'entropy'],
    #         'max_features': ['auto', 'sqrt', 'log2'], 'min_samples_split': [1, 10]}
    #svm
    svmm = svm.SVC()
    paras = {'kernel': ['rbf', 'linear', 'poly']}
    clt = grid_search.GridSearchCV(svmm, paras, cv=5)
    clt.fit(X, y)
    print (clt)
    #print (clt.get_params())
    print (clt.set_params())
    print (clt.score(X, y))
    #scores = cross_val_score(clt, X, y, cv=10)
    #print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


# this is to get the score using cross-validation (10-fold)
def get_scroe_using_cv(clt, X, y):
    scores = cross_val_score(clt, X, y, cv=10)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


# draw a confusion matrix to make the results easier to read
def draw_confusion_matrix(y_test, y_pred):
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    # Show confusion matrix in a separate window
    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()


#################### 10-FOLD CV: FALSE POSITIVES / FALSE NEGATIVES #################################################
def my_get_fp_fn_CV(X_original, y):
    # generate classifiers
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')
    # decision tree
    dtree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=4, min_samples_split=2,
                                   random_state=None, splitter='best')
    # naive bayes
    #nbbern = BernoulliNB()
    # random forest
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1, min_samples_split=2,
                                     n_estimators=10, n_jobs=1, oob_score=False, random_state=3)
    # svm
    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, kernel='rbf',
                     max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
                     verbose=False)
    # reduce the feature size
    #X = SelectKBest(f_classif, k=80).fit_transform(X_original, y)
    skb = SelectKBest(f_classif, k=80).fit(X_original, y)
    X = skb.fit_transform(X_original, y)
    print ("KNN")
    my_get_fp_fn_inter(knn, X, y)
    print ("DTree")
    my_get_fp_fn_inter(dtree, X, y)
    print ("rforest")
    my_get_fp_fn_inter(rforest, X, y)
    #print ("naive bayes")
    #my_get_fp_fn_inter(nbbern, X, y)
    print ("SVMrbf")
    my_get_fp_fn_inter(svmrbf, X, y)


def my_get_fp_fn_inter(clt, X, y):
    # 10-fold stratified cross-validation
    skf = StratifiedKFold(y, 10)
    fn_v = []
    fp_v = []
    accu_v = []
    for train_index, test_index in skf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        y_predict = clt.fit(X_train, y_train).predict(X_test)
        # 1 is malicious, 0 is benign
        y_predict_tolist = y_predict.tolist()
        y_test_tolist = y_test.tolist()
        fn = 0
        fp = 0
        accu = 0
        N = 0
        P = 0
        for i in range(0, len(y_predict_tolist)):
            if int(y_test_tolist[i]) == 1 and int(y_predict_tolist[i]) == 0:
                # false negative
                fn = fn + 1
            if int(y_test_tolist[i]) == 0 and int(y_predict_tolist[i]) == 1:
                # false positive
                fp = fp + 1
            if int(y_test_tolist[i]) == 0:
                N = N + 1
            if int(y_test_tolist[i]) == 1:
                P = P + 1
            if int(y_test_tolist[i]) == int(y_predict_tolist[i]):
                accu = accu + 1
        accu_score = float(accu + 0.0) / len(y_predict_tolist)
        # Normalize by the relevant ground-truth class sizes: the false-negative
        # rate by the number of positives (P), the false-positive rate by the
        # number of negatives (N). The original code had these two swapped.
        fn_score = float(fn + 0.0) / P
        fp_score = float(fp + 0.0) / N
        fn_v.append(fn_score)
        fp_v.append(fp_score)
        accu_v.append(accu_score)
    fn_array = np.array(fn_v)
    fp_array = np.array(fp_v)
    accu_array = np.array(accu_v)
    print("fn: %0.4f (+/- %0.4f)" % (fn_array.mean(), fn_array.std()))
    print("fp: %0.4f (+/- %0.4f)" % (fp_array.mean(), fp_array.std()))
    print("Accuracy: %0.4f (+/- %0.4f)" % (accu_array.mean(), accu_array.std()))
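# --- Hedged sketch (cross-check only, not called anywhere) -------------------
# A minimal sketch, assuming binary labels with 0 = benign and 1 = malicious,
# of deriving the same per-fold rates that my_get_fp_fn_inter() counts by hand
# from sklearn's confusion_matrix (with labels=[0, 1] it unpacks as tn, fp, fn, tp):
def _fold_rates_from_confusion_matrix_sketch(y_true, y_pred):
    from sklearn.metrics import confusion_matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    accuracy = float(tp + tn) / (tn + fp + fn + tp)
    fnr = float(fn) / (fn + tp) if (fn + tp) > 0 else 0.0
    fpr = float(fp) / (fp + tn) if (fp + tn) > 0 else 0.0
    return accuracy, fpr, fnr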
#####################################################################
# this part trains and tests the classifiers to compare their cross-validation scores
def train_and_test(X, y):
    #KNN
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')
    #naive bayes
    nbbern = BernoulliNB()
    #decision tree
    dtree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=4, min_samples_split=2,
                                   random_state=None, splitter='best')
    #random forest
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1, min_samples_split=2,
                                     n_estimators=10, n_jobs=1, oob_score=False, random_state=3)
    #svm
    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, kernel='rbf',
                     max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001,
                     verbose=False)
    get_scroe_using_cv(knn, X, y)
    get_scroe_using_cv(nbbern, X, y)
    get_scroe_using_cv(dtree, X, y)
    get_scroe_using_cv(rforest, X, y)
    get_scroe_using_cv(svmrbf, X, y)
    print ("\n")


######################################################################
# this draws an example ROC curve by splitting the dataset once;
# it is only meant to produce a nicer figure
def get_fpr_tpr(clt, X, y):
    random_state = np.random.RandomState(0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    #from sklearn import tree
    #clt = tree.DecisionTreeClassifier(criterion='entropy', min_samples_leaf=2, min_samples_split=2,
    #                                  random_state=None, splitter='best')
    clt = clt.fit(X_train, y_train)
    #from sklearn.externals.six import StringIO
    #with open("iris_plus.dot", 'w') as f:
    #    f = tree.export_graphviz(clt, out_file=f)
    y_pred = clt.predict(X_test)
    # accuracy score
    _accuracy_score = accuracy_score(y_test, y_pred)
    print ("Accuracy score {}".format(_accuracy_score))
    # ROC curve
    probas_ = clt.predict_proba(X_test)
    #print (probas_)
    #draw_confusion_matrix(y_test, y_pred)
    #print probas_
    fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
    #print (fpr, tpr, thresholds)
    roc_auc = auc(fpr, tpr)
    print ("Area under the ROC curve : %f" % roc_auc)
    return fpr, tpr, roc_auc


# this is used to draw the precision-recall curve for one train/test split
def get_my_pecision_recall(clt, X, y):
    random_state = np.random.RandomState(0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    clt = clt.fit(X_train, y_train)
    probas_ = clt.predict_proba(X_test)
    precision, recall, _ = precision_recall_curve(y_test, probas_[:, 1])
    auc_pr = auc(recall, precision)
    print ("area under the precision-recall curve: %f" % auc_pr)
    return precision, recall, auc_pr
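# --- Hedged sketch (illustrative only) ---------------------------------------
# get_fpr_tpr() and get_my_pecision_recall() above pass probas_[:, 1] to
# roc_curve / precision_recall_curve. That column corresponds to the malicious
# class (label 1) only because predict_proba orders its columns by clt.classes_;
# a minimal sketch of looking the column up explicitly instead of assuming it:
def _malicious_probability_column_sketch(clt, X_test):
    probas = clt.predict_proba(X_test)
    pos_col = int(np.where(clt.classes_ == 1)[0][0])  # column of the label-1 class
    return probas[:, pos_col]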
###################################### FINAL RESULT / SHOW PERFORMANCE ###################################
"""
After the preparation above, we select 4 different machine learning techniques and
80 features chosen with the f_classif approach; the CV scores were selected the same way.
"""
def final_train_and_test_after_preparation(X_original, y):
    #KNN
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')
    #decision tree
    dtree = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=4, min_samples_split=2,
                                   random_state=None, splitter='best')
    #random forest
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1, min_samples_split=2,
                                     n_estimators=10, n_jobs=1, oob_score=False, random_state=3)
    #svm
    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, kernel='rbf',
                     max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
                     verbose=False)
    #naive bayes
    #nbb = BernoulliNB()
    X = SelectKBest(f_classif, k=80).fit_transform(X_original, y)
    #X = X_original
    print (X.shape)
    #get_scroe_using_cv(knn, X, y)
    #get_scroe_using_cv(dtree, X, y)
    #get_scroe_using_cv(rforest, X, y)
    #get_scroe_using_cv(svmrbf, X, y)
    fpr_knn, tpr_knn, auc_knn = get_fpr_tpr(knn, X, y)
    fpr_dtree, tpr_dtree, auc_dtree = get_fpr_tpr(dtree, X, y)
    fpr_rforest, tpr_rforest, auc_rforest = get_fpr_tpr(rforest, X, y)
    fpr_svmrbf, tpr_svmrbf, auc_svmrbf = get_fpr_tpr(svmrbf, X, y)
    #fpr_nbb, tpr_nbb, auc_nbb = get_fpr_tpr(nbb, X, y)
    plt.clf()
    plt.plot(fpr_svmrbf, tpr_svmrbf, 'y.--', label='SVM AUC=%0.4f' % auc_svmrbf)
    plt.plot(fpr_knn, tpr_knn, 'r^--', label='KNN AUC=%0.4f' % auc_knn)
    plt.plot(fpr_dtree, tpr_dtree, 'b>--', label='D.Tree AUC=%0.4f' % auc_dtree)
    plt.plot(fpr_rforest, tpr_rforest, 'go--', label='R.Forest AUC=%0.4f' % auc_rforest)
    #plt.plot(fpr_nbb, tpr_nbb, 'c*--', label='Naive Bayes AUC=%0.4f' % auc_nbb)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([-0.02, 1.02])
    plt.ylim([-0.02, 1.02])
    plt.xlabel('FPR (False Positive Rate)', fontsize=20)
    plt.ylabel('TPR (True Positive Rate)', fontsize=20)
    #plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.grid()
    plt.show()
    del X
    del y


################################## DRAW P-R CURVE #######################################################
# this is the precision and recall curve
def precision_recall_curve_draw(X_o, y):
    X = SelectKBest(f_classif, k=80).fit_transform(X_o, y)
    print (X.shape)
    print (y.shape)
    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, kernel='rbf',
                     max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
                     verbose=False)
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')
    dtree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=1, min_samples_split=2,
                                   random_state=None, splitter='best')
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1, min_samples_split=2,
                                     n_estimators=10, n_jobs=1, oob_score=False, random_state=3)
    p_svmrbf, r_svmrbf, auc_svmrbf = get_my_pecision_recall(svmrbf, X, y)
    p_knn, r_knn, auc_knn = get_my_pecision_recall(knn, X, y)
    p_dtree, r_dtree, auc_dtree = get_my_pecision_recall(dtree, X, y)
    p_rforest, r_rforest, auc_rforest = get_my_pecision_recall(rforest, X, y)
    plt.clf()
    plt.plot(r_svmrbf, p_svmrbf, 'y.--', label='SVM auc=%0.3f' % auc_svmrbf)
    plt.plot(r_knn, p_knn, 'r^--', label='KNN auc=%0.3f' % auc_knn)
    plt.plot(r_dtree, p_dtree, 'b>--', label='Decision Tree auc=%0.3f' % auc_dtree)
    plt.plot(r_rforest, p_rforest, 'go--', label='Random Forest auc=%0.3f' % auc_rforest)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('recall rate')
    plt.ylabel('precision rate')
    plt.title('precision-recall curve')
    plt.legend(loc="lower right")
    plt.show()
    del X
    del y
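# --- Hedged sketch (optional convenience, not used by the functions above) ---
# Both plotting helpers above end with plt.show(), which needs an interactive
# display. A minimal sketch of writing the current figure to disk instead;
# the file name and dpi are placeholder choices:
def _save_current_figure_sketch(filename="figure.png"):
    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches="tight")
    plt.close()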
############################################### Examples to show the difference of feature representations ##################################
from FeatureList import *
from androguard.core.bytecodes.dvm_permissions import *


# f is the apk file name
def features_representation_difference(X_original, y, f):
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1, min_samples_split=2,
                                     n_estimators=10, n_jobs=1, oob_score=False, random_state=3)
    #rforest = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, kernel='rbf',
    #                  max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001, verbose=False)
    skb = SelectKBest(f_classif, k=80).fit(X_original, y)
    X = skb.fit_transform(X_original, y)
    #print (skb.get_support(indices=False))
    rforest.fit(X, y)
    my_get_fp_fn_inter(rforest, X, y)
    features_80_ = skb.get_support(indices=False).tolist()
    len1 = len(features_80_)  #242
    # map the position of each selected feature (0..79) back to its original column index
    feature_map = {}
    key = 0
    for i in range(len1):
        if (features_80_[i]):
            feature_map[key] = i
            key = key + 1

    ############# description of the features
    Manifest = str("MANIFEST_PERMISSION")
    permission_category = ['normal', 'dangerous', 'signature', 'signatureOrSystem']
    length = len(DVM_PERMISSIONS[Manifest])
    permission_des = ['1'] * length
    r = 0
    for i in DVM_PERMISSIONS[Manifest].keys():
        permission_des[r] = i
        r = r + 1
    #print (len(permission_des))       # 137 permissions + 1 sum + 1 occu
    #print (len(permission_category))  # 4 permission categories + 1 sum + 1 occu
    #print (len(karim_api_list))       # 57 APIs + 1 sum + 1 occu
    #print (len(user_action_list))     # 35 user actions + 1 sum + 1 occu
    description_vector = []
    description_vector = list(permission_des) + ['per sum'] + ['per occu']
    description_vector = description_vector + list(permission_category) + ['per_cat sum'] + ['per_cat occu']
    description_vector = description_vector + list(APIList) + ['api sum'] + ['api occu']
    description_vector = description_vector + list(user_action_list) + ['action sum'] + ['action occu']
    description_vector = description_vector + ['cover rate']
    print (f)
    print ("total features with description # {}".format(len(description_vector)))
    #feature_map = sorted(feature_map)
    #print (feature_map)
    print ("reduced features to #{} with importance rates".format(len(rforest.feature_importances_.tolist())))
    feature_importance_v = rforest.feature_importances_.tolist()
    sorted_feature_imp = sorted(feature_importance_v, key=float, reverse=True)
    indices = [i[0] for i in sorted(enumerate(feature_importance_v), key=lambda x: x[1])]

    new_app = newStart(f)
    bfea = build_features_each_app(new_app, option=True)
    if len(bfea.feature_dic.keys()) > 1:
        vv = []
        vv_len = []
        p_transformed_dic = {}
        for j in bfea.feature_dic.keys():
            vector = bfea.feature_dic[j]
            print (vector)
            classification_vector = skb.transform(vector)
            if (occupy_vector_all_zero(classification_vector)):
                print ("all zero vector detected")
                t = 0
            else:
                t = int(rforest.predict(skb.transform(vector)))
                print (t)
                #print (rforest.predict_proba(skb.transform(vector)).tolist())
            p_transformed_dic[j] = skb.transform(vector)
            vv.append(t)
            vv_len.append(len(bfea.split_nodelist[j]))
            print (bfea.split_nodelist[j])
        sub_rate = calculate_sub_mali_rate(vv)
        print ("predict dis: {} len {} sub_rate{}".format(vv, vv_len, sub_rate))
        bfea_non = build_features_each_app(new_app, option=False)
        t = int(rforest.predict(skb.transform(bfea_non.feature_dic[0])))
        nonp_transformed_v = skb.transform(bfea_non.feature_dic[0])
        print ("non predict: {}\n".format(t))
        v1 = p_transformed_dic[0].tolist()[0]
        v2 = p_transformed_dic[1].tolist()[0]
        v3 = nonp_transformed_v.tolist()[0]
        # to compute the feature difference we just focus on apps with two subgraphs
        for r in indices:
            imp_value = rforest.feature_importances_.tolist()[r]
            impp = float("{:10.4f}".format(imp_value))
            ind = sorted_feature_imp.index(imp_value)
            p1 = float("{:10.4f}".format(v1[r]))
            p2 = float("{:10.4f}".format(v2[r]))
            p3 = float("{:10.4f}".format(v3[r]))
            index = feature_map[r]
            print (str(p1) + " " + str(p2) + " Nonbased " + str(p3) + " || " + str(feature_map[r]) + " " +
                   description_vector[index] + " || " + " weight " + str(ind) + " value " + str(impp))
    else:
        t = int(rforest.predict(skb.transform(bfea.feature_dic[0])))
        # The non-partitioned vector was referenced here without being built in the
        # original code (bfea_non was undefined in this branch); build it first.
        bfea_non = build_features_each_app(new_app, option=False)
        nonp_transformed_v = skb.transform(bfea_non.feature_dic[0])
        v3 = nonp_transformed_v.tolist()[0]
        for r in indices:
            imp_value = rforest.feature_importances_.tolist()[r]
            impp = float("{:10.4f}".format(imp_value))
            ind = sorted_feature_imp.index(imp_value)
            p3 = float("{:10.4f}".format(v3[r]))
            index = feature_map[r]
            print (str(p3) + " || " + str(feature_map[r]) + " " + description_vector[index] + " || " +
                   " weight " + str(ind) + " value " + str(impp))
        #print (bfea.feature_dic[0])
        print (t)
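# --- Hedged sketch (illustrative only) ---------------------------------------
# features_representation_difference() above ranks the 80 selected features by
# sorting enumerate(...) pairs; a minimal sketch of the same ranking done with
# numpy (descending by importance), mirroring what feature_importances() further
# below does with np.argsort. feature_map and description_vector are the local
# structures built in that function and are taken here as parameters:
def _rank_selected_features_sketch(fitted_forest, feature_map, description_vector):
    importances = fitted_forest.feature_importances_
    order = np.argsort(importances)[::-1]  # selected-feature positions, most important first
    return [(description_vector[feature_map[int(i)]], float(importances[i])) for i in order]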
################################## Final: predicting on multiple subgraphs ###############
# test whether an app is repackaged / has benign components / has malicious components
def predict_new_app(y_pred):
    # 0: all subgraphs benign, 1: all subgraphs malicious, 2: mixed benign and malicious components
    x = None
    #t = len(y_pred)
    if 0 in y_pred and 1 not in y_pred:
        x = 0
    if 1 in y_pred and 0 not in y_pred:
        x = 1
    if 1 in y_pred and 0 in y_pred:
        x = 2
    return x


def calculate_mali_rate(vv, vv_len):
    assert isinstance(vv, list)
    assert isinstance(vv_len, list)
    total_len = 0
    mali_len = 0
    rate = 0.0
    for i in range(len(vv)):
        total_len = total_len + vv_len[i]
        if vv[i] == 1:
            mali_len = mali_len + vv_len[i]
    rate = float(mali_len + 0.0) / total_len
    ben_len = total_len - mali_len
    return (rate, mali_len, ben_len)


def calculate_sub_mali_rate(vv):
    assert isinstance(vv, list)
    total_v = 0
    mal_v = 0
    for i in range(len(vv)):
        total_v = total_v + 1
        if vv[i] == 1:
            mal_v = mal_v + 1
    rate = float(mal_v + 0.0) / total_v
    return (rate)


def occupy_vector_all_zero(_list):
    for i in _list:
        if i.any():  # any nonzero entry in this row
            return False
    return True
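# --- Hedged usage sketch (the example values are made up) ---------------------
# How the helpers above combine: given per-subgraph predictions vv and the number
# of classes in each subgraph vv_len, predict_new_app() labels the whole app
# (0 = all benign, 1 = all malicious, 2 = mixed, i.e. likely repackaged) and
# calculate_mali_rate() weights the verdicts by subgraph size.
def _subgraph_verdict_sketch():
    vv = [1, 0, 0]        # hypothetical: one malicious subgraph, two benign ones
    vv_len = [12, 30, 5]  # hypothetical class counts per subgraph
    app_label = predict_new_app(vv)                            # -> 2 (mixed components)
    rate, mali_len, ben_len = calculate_mali_rate(vv, vv_len)  # -> (12/47., 12, 35)
    sub_rate = calculate_sub_mali_rate(vv)                     # -> 1/3.
    return app_label, rate, mali_len, ben_len, sub_rate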
import signal


def handler(signum, frame):
    print ("time out")
    raise RuntimeError


import time


def predict_mulitple_subgraphs(X_original, y):
    time_start = time.time()
    #X = SelectKBest(f_classif, k=80).fit_transform(X_original, y)
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1, min_samples_split=2,
                                     n_estimators=10, n_jobs=1, oob_score=False, random_state=3)
    #rforest = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, kernel='rbf',
    #                  max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001, verbose=False)
    #rforest = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', n_neighbors=5, p=2, weights='uniform')
    #rforest = DecisionTreeClassifier(criterion='gini', min_samples_leaf=1, min_samples_split=2, random_state=None, splitter='best')
    skb = SelectKBest(f_classif, k=80).fit(X_original, y)
    X = skb.fit_transform(X_original, y)
    print (skb.get_support(indices=False))
    rforest.fit(X, y)
    #my_get_fp_fn_inter(rforest, X, y)
    #m_secs = (time.time() - time_start) * 1000
    #print ("training mi-seconds {}".format(m_secs))
    f_test_new_released = 'apks/'
    files = get_filepaths(f_test_new_released)
    subgraph_property(files, rforest, skb)


import gc


def subgraph_property(files, clt, skb):
    files = sorted(files, key=str.lower)
    files = (files[0:50])
    subgraph_num = []
    malicious_rate = []
    string = "DESCRIBE_YOUR_APKS"
    subgraph_num_file = str(string) + "_subgraph_number.txt"
    malicious_rate_file = str(string) + "_malicious_rate.txt"
    for f in files:
        print ("{0}".format(f))
        new_app = None
        bfea = None
        if f in _ExceptList:
            continue
        signal.signal(signal.SIGALRM, handler)
        try:
            #signal.alarm(0)
            new_app = newStart(f)
            bfea = build_features_each_app(new_app, option=True)
            del new_app
            if len(bfea.feature_dic.keys()) > 1:
                vv = []
                vv_len = []
                for j in bfea.feature_dic.keys():
                    vector = bfea.feature_dic[j]
                    classification_vector = skb.transform(vector)
                    if (occupy_vector_all_zero(classification_vector)):
                        t = 0
                    else:
                        t = int(clt.predict(skb.transform(vector)))
                    vv.append(t)
                    vv_len.append(len(bfea.split_nodelist[j]))
                sub_rate = calculate_sub_mali_rate(vv)
                #if sub_rate > 0:
                #    _f_mul_mal_nouseraction.write(f + '\n')
                #if (sub_rate > 0):
                ss = "predict dis: {} #classes-dis:{} sub_rate{}".format(vv, vv_len, sub_rate)
                print (ss)
                #sub_num_multi_v.append(len(vv))
                #sub_num_all_v.append(len(vv))
                #test_vector.append(sub_rate)
                subgraph_num.append(len(vv))
                malicious_rate.append(sub_rate)
                #bfea_non = build_features_each_app(new_app, option=False)
                #t = int(clt.predict(skb.transform(bfea_non.feature_dic[0])))
                #non_test_vector.append(t)
                #rate_non_partition_multiple.append(t)
                #rate_non_partition_all.append(t)
                #print ("non predict: {}\n".format(t))
            else:
                t = int(clt.predict(skb.transform(bfea.feature_dic[0])))
                #sub_num_all_v.append(1)
                subgraph_num.append(1)
                malicious_rate.append(t)
                #if (t > 0):
                ss = " single {}".format(t)
                print (ss)
                #if t > 0:
                #    _f_single_mal_no_useraction.write(f + '\n')
                #test_vector.append(float(t))
            del bfea
        except:
            print ("failed")
            del new_app, bfea
            continue
        #signal.alarm(0)
        gc.collect()
    fnew = open(subgraph_num_file, 'w')
    write_list_into_files(subgraph_num, fnew)
    fnew.close()
    fnew = open(malicious_rate_file, 'w')
    write_list_into_files(malicious_rate, fnew)
    fnew.close()
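# --- Hedged sketch (not wired into subgraph_property above) -------------------
# subgraph_property() registers the SIGALRM handler but never calls signal.alarm(),
# so the timeout never actually fires. A minimal sketch, assuming a Unix platform
# and a placeholder 300-second budget, of how the handler would normally be used
# to bound the per-APK analysis time:
def _analyze_with_timeout_sketch(apk_path, seconds=300):
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(seconds)              # arm the timer before the slow call
    try:
        app = newStart(apk_path)
        return build_features_each_app(app, option=True)
    except RuntimeError:               # raised by handler() when the alarm fires
        return None
    finally:
        signal.alarm(0)                # always disarm the timer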
####################################################################################
def feature_importances(X, y):
    # The output is not stable because of the randomness of the forest.
    # Build a classification task using 3 informative features:
    #X, y = make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
    #                           n_repeated=0, n_classes=2, random_state=0, shuffle=False)
    # Build a forest and compute the feature importances.
    from sklearn.ensemble import ExtraTreesClassifier
    forest = ExtraTreesClassifier(n_estimators=25, criterion='entropy', random_state=None)
    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    #print (indices)
    # Print the feature ranking
    print("Feature ranking:")
    sum1 = 0.0
    for f in range(80):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
        sum1 = sum1 + importances[indices[f]]
    print (sum1)
    # Plot the feature importances of the forest
    #width = 0.5
    x_len = range(len(importances))
    plt.figure()
    plt.title("Feature importances")
    plt.bar(x_len, importances[indices], color="r", yerr=std[indices], align="center")
    plt.xticks(x_len, indices)
    plt.xlim([-1, max(x_len) + 1])
    plt.show()


###################################### READ DATA ####################################################
def read_data_onesubgraph():
    file1 = 'data/z_benign_doublechecked_one_subgraph.txt'      #994
    file4 = 'data/z_benign_doublechecked_new_one_subgraph.txt'  #825
    file2 = 'data/z_mali_genome_one_subgraph.txt'               #409
    file3 = 'data/z_mali_virus_share_one_subgraph.txt'          #1097
    X1, y1 = _read_lists(file1)
    X4, y4 = _read_lists(file4)
    X2, y2 = _read_lists(file2)
    X3, y3 = _read_lists(file3)
    y11 = [0] * len(y1)
    y44 = [0] * len(y4)
    y22 = [1] * len(y2)
    y33 = [1] * len(y3)
    X = np.concatenate((X1, X2, X3, X4), axis=0)
    y = np.concatenate((y11, y22, y33, y44), axis=0)
    print ("************")
    print ("The training data size: {}".format(X.shape))
    del file1, file2, file3, file4
    del X1, X2, X3, y1, y2, y3, y11, y22, y33
    return X, y


def runML():
    X, y = read_data_onesubgraph()
    feature_importances(X, y)
    #my_get_fp_fn_CV(X, y)
    final_train_and_test_after_preparation(X, y)
    precision_recall_curve_draw(X, y)


if __name__ == "__main__":
    runML()
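# --- Hedged sketch (optional, not part of the original workflow) --------------
# Training the selector and the forest is repeated on every run; a minimal
# sketch, using the standard pickle module and a placeholder file name, of
# persisting a fitted SelectKBest + RandomForest pair so that
# predict_mulitple_subgraphs-style prediction could reuse them later:
def _persist_models_sketch(skb, rforest, path="trained_models.pkl"):
    import pickle
    with open(path, "wb") as fh:
        pickle.dump({"selector": skb, "classifier": rforest}, fh)


def _load_models_sketch(path="trained_models.pkl"):
    import pickle
    with open(path, "rb") as fh:
        models = pickle.load(fh)
    return models["selector"], models["classifier"]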