'''Classify recipes into regional cuisines based on ingredients or flavors, using logistic regresion, SVM, randomforest, MultinomialNB, and plot confusion_matrix ''' import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline import cPickle as pickle from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn import svm from sklearn.naive_bayes import MultinomialNB from sklearn import cross_validation, grid_search from sklearn.cross_validation import KFold, train_test_split from sklearn import metrics from sklearn.metrics import confusion_matrix, classification_report def logistic_test(X,y): X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=10) model = LogisticRegression() model.fit(X_train, y_train) y_pred = model.predict(X_test) print 'First round:',metrics.accuracy_score(y_test,y_pred) #tune parameter C crange =[0.01,0.1,1,10,100] for num in crange: model = LogisticRegression(C=num) model.fit(X_train, y_train) y_pred = model.predict(X_test) print 'C=', num, ',score=', metrics.accuracy_score(y_test,y_pred) def svm_test(X,y): X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=10) model = svm.LinearSVC(C=1) model.fit(X_train, y_train) y_pred = model.predict(X_test) print 'First round:',metrics.accuracy_score(y_test,y_pred) #tune parameter C crange =[0.01,0.1,1,10,100] for num in crange: model = svm.LinearSVC(C=num) model.fit(X_train, y_train) y_pred = model.predict(X_test) print 'C=', num, ',score=', metrics.accuracy_score(y_test,y_pred) def nb_test(X,y): X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1) model = MultinomialNB() model.fit(X_train, y_train) y_pred = model.predict(X_test) print metrics.accuracy_score(y_test,y_pred) def rf_test(X,y): X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=10) rf_model = RandomForestClassifier(n_estimators = 100, n_jobs=-1) rf_model.fit(X_train, y_train) y_pred = rf_model.predict(X_test) print metrics.accuracy_score(y_test,y_pred) #plot confusion_matrix, 'col' is the y target def plot_confusion_matrix(cm, col, title, cmap=plt.cm.viridis): plt.imshow(cm, interpolation='nearest', cmap=cmap) for i in range(cm.shape[0]): plt.annotate("%.2f" %cm[i][i],xy=(i,i), horizontalalignment='center', verticalalignment='center') plt.title(title,fontsize=18) plt.colorbar(fraction=0.046, pad=0.04) tick_marks = np.arange(len(col.unique())) plt.xticks(tick_marks, sorted(col.unique()),rotation=90) plt.yticks(tick_marks, sorted(col.unique())) plt.tight_layout() plt.ylabel('True label',fontsize=18) plt.xlabel('Predicted label',fontsize=18) #using flavor network to project recipes from ingredient matrix to flavor matrix def flavor_profile(df,ingr,comp,ingr_comp): sorted_ingredients = df.columns underscore_ingredients=[] for item in sorted_ingredients: underscore_ingredients.append(item.replace(' ','_')) print len(underscore_ingredients), len(sorted_ingredients) ingr_total = ingr_comp.join(ingr,how='right',on='# ingredient id') ingr_total = ingr_total.join(comp,how='right',on='compound id') ingr_pivot = pd.crosstab(ingr_total['ingredient name'],ingr_total['compound id']) ingr_flavor = ingr_pivot[ingr_pivot.index.isin(underscore_ingredients)] df_flavor = df.values.dot(ingr_flavor.values) print df.shape, df_flavor.shape return df_flavor #normalize flavor matrix with tfidf method def make_tfidf(arr): '''input, numpy array with flavor counts for each recipe and compounds return numpy array adjusted as tfidf ''' arr2 = arr.copy() N=arr2.shape[0] l2_rows = np.sqrt(np.sum(arr2**2, axis=1)).reshape(N, 1) l2_rows[l2_rows==0]=1 arr2_norm = arr2/l2_rows arr2_freq = np.sum(arr2_norm>0, axis=0) arr2_idf = np.log(float(N+1) / (1.0 + arr2_freq)) + 1.0 from sklearn.preprocessing import normalize tfidf = np.multiply(arr2_norm, arr2_idf) tfidf = normalize(tfidf, norm='l2', axis=1) print tfidf.shape return tfidf if __name__ == '__main__': #read pickled dataframe yum_clean = pd.read_pickle('data/yummly_clean.pkl') #create a set of all ingredients in the dataframe yum_ingredients=set() yum_clean['clean ingredients'].map(lambda x: [yum_ingredients.add(i) for i in x]) print len(yum_ingredients) #create one column for each ingredient, True or False yum = yum_clean.copy() for item in yum_ingredients: yum[item] = yum['clean ingredients'].apply(lambda x:item in x) yum_X = yum.drop(yum_clean.columns,axis=1) #test various classification models logistic_test(yum_X, yum['cuisine']) #C=1 gave the best result, accuracy 0.69 svm_test(yum_X,yum['cuisine']) #linear svm C=0.1 gave the best result, accuracy 0.70 nb_test(yum_X,yum['cuisine']) #accuracy is 0.64 rf_test(yum_X,yum['cuisine']) #accuracy is 0.64 #plot confusion_matrix with svm X = yum_X.values y = yum['cuisine'] X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=10) model = svm.LinearSVC(C=0.1) model.fit(X_train, y_train) y_pred = model.predict(X_test) cm = confusion_matrix(y_test, y_pred) np.set_printoptions(precision=2) cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] plt.figure(figsize=(10,10)) plot_confusion_matrix(cm_normalized, yum['cuisine'],title='Confusion Matrix based on ingredients') #read pickled dataframe yum_ingr = pd.read_pickle('data/yum_ingr.pkl') yum_tfidf = pd.read_pickle('data/yum_tfidf.pkl') #plot confusion matrix for flavor-based classification X = yum_tfidf.values y = yum_ingr['cuisine'] X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=10) model = LogisticRegression() model.fit(X_train, y_train) y_pred = model.predict(X_test) cm = confusion_matrix(y_test, y_pred) np.set_printoptions(precision=2) cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] plt.figure(figsize=(10,10)) plot_confusion_matrix(cm_normalized, yum_ingr['cuisine'],title='Confusion Matrix based on flavor')