'''Clean up recipes from yummly, to produce ingredient lists and flavor lists, and normalize flavor profile using tfidf technique Pickle the dataframe after clean up as 'yummly_clean.pkl' Pickle the dataframe projected into the ingredient space in the flavor network as 'yummly_ingr.pkl' ''' import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns from nltk.stem import WordNetLemmatizer from nltk.stem.snowball import SnowballStemmer from nltk.tokenize import word_tokenize import re import cPickle as pickle import itertools from collections import Counter #take the long string in 'ingredients', lemmatize, regex, and split into words def split_ingr(x): wnl=WordNetLemmatizer() cleanlist=[] lst = x.strip('[]').split(',') cleanlist=[' '.join(wnl.lemmatize(word.lower()) for word in word_tokenize(re.sub('[^a-zA-Z]',' ',item))) for item in lst] return cleanlist #remove low-information words from ingredients, could use more def remove_word(word): alist =['low fat', 'reduced fat', 'fat free', 'fatfree', 'nonfat','gluten free', 'free range',\ 'reduced sodium', 'salt free','sodium free', 'low sodium', 'sweetened','unsweetened','large','extra large','oz '] for item in alist: word = word.replace(item,'') return word #match ingredients in yummly recipes to ingredients in graph; filter out those with >3 missing matches def cleanup_ingredients(ingr,df,col): df_ingr = set() df[col].map(lambda x: [df_ingr.add(i) for i in x]) long_ingredients = filter(lambda x: ' ' in x, ingr) short_ingredients = ingr - set(long_ingredients) df_dic={} for item in long_ingredients: for key in df_ingr: if item in key: if key not in df_dic: df_dic[key] = [item] else: df_dic[key].append(item) for item in short_ingredients: for key in df_ingr: if item in key.split(): if key not in df_dic: df_dic[key] = [item] else: df_dic[key].append(item) diff_dic = df_ingr - set(df_dic.keys()) df_dic = tweak_dic(df_dic, diff_dic) diff_dic = df_ingr - set(df_dic.keys()) print 'length of ingredients, matched ingredients, missed ingredients' print len(df_ingr), len(df_dic.keys()), len(diff_dic) df2 = df.copy() df2['len_diff'] = df2[col].apply(lambda x: count_missing(x,df_dic)) df2['match ingredients'] = df2[col].apply(lambda x: ingr_replace(x,df_dic)) df2['len_match'] = df2['match ingredients'].apply(lambda x: len(x)) #remove entries with less match ingredients or no matching ingr_ingredients df3 = df2[(df2['len_diff']<3) & (df2['len_match']!=0)] print 'dataframe shape before and after filtering' print df2.shape, df3.shape #sort ingredients set for later matching to flavor match_ingr = set() df3['match ingredients'].map(lambda x: [match_ingr.add(i) for i in x]) sorted_ingr = sorted(list(match_ingr)) #create columns for each ingredient df4 = df3.copy() for item in sorted_ingr: df4[item] = df4['match ingredients'].apply(lambda x:item in x) df_X = df4.drop(df3.columns, axis=1) return df4, df_X #after direct string matching, catch some spelling differences through this def tweak_dic(df_dic, diff_df): alist = ['chile', 'chili','chilies','chilli','sriracha'] for pepper in alist: for item in filter(lambda x: pepper in x, diff_df): if item not in df_dic: df_dic[item] = ['tabasco pepper'] for item in filter(lambda x: 'flour' in x, diff_df): if item not in df_dic: df_dic[item] = ['whole grain wheat flour'] for item in filter(lambda x: 'tumeric' in x, diff_df): if item not in df_dic: df_dic[item] = ['turmeric'] for item in filter(lambda x: 'yoghurt' in x, diff_df): if item not in df_dic: df_dic[item] = ['yogurt'] for item in filter(lambda x: 'sausage' in x, diff_df): if item not in df_dic: df_dic[item] = ['smoked sausage'] alist = ['rib','chuck','sirloin','steak'] for beef in alist: for item in filter(lambda x: beef in x, diff_df): if item not in df_dic: df_dic[item] = ['beef'] for item in filter(lambda x: 'fillet' in x, diff_df): if item not in df_dic: df_dic[item] = ['raw fish'] for item in filter(lambda x: 'mozzarella' in x, diff_df): if item not in df_dic: df_dic[item] = ['mozzarella cheese'] for item in filter(lambda x: 'spinach' in x, diff_df): if item not in df_dic: df_dic[item] = ['dried spinach'] for item in filter(lambda x: 'curry' in x, diff_df): if item not in df_dic: df_dic[item] = ['coriander','turmeric','cumin','cayenne'] return df_dic #Count missing ingredients after matching; salt, sugar, water and oil are not in the flavor network and thus don't count as missing def count_missing(lst, df_dic): cnt = 0 for item in lst: if item in df_dic: cnt+=1 elif 'salt' in item.split(): cnt+=1 elif 'sugar' in item.split(): cnt+=1 elif 'water' in item.split(): cnt+=1 elif 'oil' in item.split(): cnt+=1 return len(lst) - cnt #After making dictionary to map ingredients from yummly recipes to ingredients in the flavor network, this is to replace ingredients in recipes with ingredients in the flavor network def ingr_replace(lst, df_dic): temp = set() for item in lst: if item in df_dic: temp.update(df_dic[item]) return temp #using flavor network to project recipes from ingredient matrix to flavor matrix def flavor_profile(df,ingr,comp,ingr_comp): sorted_ingredients = df.columns underscore_ingredients=[] for item in sorted_ingredients: underscore_ingredients.append(item.replace(' ','_')) print len(underscore_ingredients), len(sorted_ingredients) ingr_total = ingr_comp.join(ingr,how='right',on='# ingredient id') ingr_total = ingr_total.join(comp,how='right',on='compound id') ingr_pivot = pd.crosstab(ingr_total['ingredient name'],ingr_total['compound id']) ingr_flavor = ingr_pivot[ingr_pivot.index.isin(underscore_ingredients)] df_flavor = df.values.dot(ingr_flavor.values) print df.shape, df_flavor.shape return df_flavor #normalize flavor matrix with tfidf method def make_tfidf(arr): '''input, numpy array with flavor counts for each recipe and compounds return numpy array adjusted as tfidf ''' arr2 = arr.copy() N=arr2.shape[0] l2_rows = np.sqrt(np.sum(arr2**2, axis=1)).reshape(N, 1) l2_rows[l2_rows==0]=1 arr2_norm = arr2/l2_rows arr2_freq = np.sum(arr2_norm>0, axis=0) arr2_idf = np.log(float(N+1) / (1.0 + arr2_freq)) + 1.0 from sklearn.preprocessing import normalize tfidf = np.multiply(arr2_norm, arr2_idf) tfidf = normalize(tfidf, norm='l2', axis=1) print tfidf.shape return tfidf if __name__ == '__main__': yum = pd.read_pickle('data/yummly.pkl') #drop duplicates yum = yum.drop_duplicates(['id'], keep='first') #drop low ratings yum = yum[yum['rating']>2] #drop dishes such as dessert and sauce yum = yum[yum['course']!='[Desserts]'] yum = yum[yum['course']!='[Condiments and Sauces]'] #clean up cuisine labels yum['cuisine']= yum['cuisine'].apply(lambda x: x.strip('[]')) cuisine_dic = {'Thai, Asian': 'Thai', 'Chinese, Asian':'Chinese', 'Japanese, Asian':'Japanese', 'Southern & Soul Food, American': 'Southern & Soul Food', 'Mediterranean, Greek': 'Mediterranean', 'Cajun & Creole, Southern & Soul Food, American': 'Southern & Soul Food', 'Asian, Japanese': 'Japanese','Cajun & Creole, American': 'Cajun & Creole', 'Hawaiian, American': 'Hawaiian', 'Asian, Thai': 'Thai', 'American, Cuban':'Cuban', 'Greek, Mediterranean': 'Greek', 'Indian, Asian': 'Indian','Asian, Chinese':'Chinese', 'American, Kid-Friendly': 'American', 'Spanish, Portuguese':'Spanish', 'Mexican, Southwestern': 'Mexican', 'Southwestern, Mexican': 'Southwestern', 'American, Southern & Soul Food': 'Southern & Soul Food', 'Cajun & Creole, Southern & Soul Food': 'Southern & Soul Food', 'Portuguese, American':'American','American, French': 'American', 'American, Cajun & Creole':'American', 'American, Cajun & Creole, Southern & Soul Food': 'American', 'Irish, American':'American' } yum['cuisine'] = yum['cuisine'].apply(lambda x: cuisine_dic[x] if x in cuisine_dic else x) #remove some cusines with few dishes subcuisine = list(yum['cuisine'].value_counts().index[:25]) yum = yum[yum['cuisine'].isin(subcuisine)] #clean up ingredients and create list yum['clean ingredients'] = yum['ingredients'].apply(lambda x: split_ingr(x)) yum['clean ingredients'] = yum['clean ingredients'].apply(lambda x:[remove_word(word) for word in x]) yum.to_pickle('data/yummly_clean.pkl') #make list and set for all ingredients yum_lst = list(itertools.chain(*(yum['clean ingredients'].tolist()))) yum_ingr = set(yum_lst) print len(yum_lst), len(yum_ingr) #load ingr and comp information for the flavor network comp = pd.read_csv('data/comp_info.tsv',index_col=0,sep='\t') ingr_comp = pd.read_csv('data/ingr_comp.tsv',sep='\t') ingr = pd.read_csv('data/ingr_info.tsv',index_col=0,sep='\t') ingr['space ingredients']= ingr['ingredient name'].apply(lambda x: x.replace('_',' ') ) ingr_ingredients = set() ingr['space ingredients'].map(lambda x: ingr_ingredients.add(x)) print len(ingr_ingredients) #clean up ingredients and get two dataframes yum_ingr, yum_X = cleanup_ingredients(ingr_ingredients, yum, 'clean ingredients') #pickle the dataframe yum_ingr and yum_X yum_ingr.to_pickle('data/yummly_ingr.pkl') yum_X.to_pickle('data/yummly_ingrX.pkl') #get flavor profile yum_flavor = flavor_profile(yum_X, ingr, comp, ingr_comp) #make tfidf from flavor profile yum_tfidf = make_tfidf(yum_flavor) #pickle numpy array as dataframes pd.DataFrame(yum_flavor).to_pickle('data/yum_flavor.pkl') pd.DataFrame(yum_tfidf).to_pickle('data/yum_tfidf.pkl')