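'''Clean up recipes from Yummly to produce ingredient lists and flavor profiles, and normalize the flavor profile using the tf-idf technique.

Output files (all written under data/):
- 'yummly_clean.pkl': the recipe dataframe after filtering and ingredient clean-up
- 'yummly_ingr.pkl': the dataframe projected into the ingredient space of the flavor network (one boolean column per matched ingredient)
- 'yummly_ingrX.pkl': only the boolean ingredient columns of the dataframe above
- 'yum_flavor.pkl': the recipe-by-compound flavor matrix
- 'yum_tfidf.pkl': the flavor matrix after tf-idf normalization
'''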

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import re
import cPickle as pickle
import itertools
from collections import Counter

#take the long string in 'ingredients', split it on commas into ingredient names, replace non-letters with spaces, and lowercase + lemmatize every word
def split_ingr(x):
    '''Turn the raw 'ingredients' string of one recipe into a list of cleaned names.

    x is a string such as '[2 cups Flour, 1 Egg]'. The surrounding square
    brackets are stripped and the string is split on commas. Within each
    piece every non-letter character is replaced by a space, the piece is
    tokenized, each token is lowercased and lemmatized, and the tokens are
    joined back with single spaces.

    Returns a list with one cleaned string per comma-separated piece.
    '''
    lemmatizer = WordNetLemmatizer()
    cleaned = []
    for raw_item in x.strip('[]').split(','):
        # quantities, punctuation and other non-letters become separators
        letters_only = re.sub('[^a-zA-Z]', ' ', raw_item)
        tokens = word_tokenize(letters_only)
        lemmas = [lemmatizer.lemmatize(token.lower()) for token in tokens]
        cleaned.append(' '.join(lemmas))
    return cleaned

#remove low-information words from ingredients, could use more
def remove_word(word):
    '''Remove low-information qualifier phrases from an ingredient string.

    Every occurrence of each phrase below is deleted by plain substring
    replacement. Surrounding whitespace is left untouched, so the result
    may start with, or contain, extra spaces.

    word: ingredient name (lowercase text is expected, since the phrases
          are lowercase and matching is case-sensitive).
    Returns the string with the phrases removed.
    '''
    # Order matters: a phrase that contains another phrase must be listed
    # first ('unsweetened' before 'sweetened', 'extra large' before 'large').
    # Otherwise the shorter phrase is removed first and leaves fragments
    # such as 'un' or 'extra ' behind.
    phrases = ('low fat', 'reduced fat', 'fat free', 'fatfree', 'nonfat',
               'gluten free', 'free range', 'reduced sodium', 'salt free',
               'sodium free', 'low sodium', 'unsweetened', 'sweetened',
               'extra large', 'large', 'oz ')
    for phrase in phrases:
        word = word.replace(phrase, '')
    return word

#match ingredients in yummly recipes to ingredients in graph; keep only recipes with fewer than 3 unmatched ingredients and at least one matched ingredient
def cleanup_ingredients(ingr,df,col):
    '''Map recipe ingredients onto flavor-network ingredients and filter recipes.

    ingr: set of flavor-network ingredient names (words separated by spaces).
    df:   recipe dataframe.
    col:  name of the column in df holding each recipe's list of ingredient
          strings.

    A recipe ingredient is matched to a multi-word network ingredient when
    the network name is a substring of it, and to a single-word network
    ingredient when that word is one of its whitespace-separated tokens.
    Ingredients still unmatched are then passed through tweak_dic().

    Recipes are kept only if fewer than 3 of their ingredients are missing
    (as counted by count_missing) and at least one ingredient was matched.

    Returns (df4, df_X): df4 is the filtered dataframe with the added
    columns 'len_diff', 'match ingredients', 'len_match' plus one boolean
    column per matched network ingredient (sorted by name); df_X contains
    only those boolean columns.
    '''
    # every distinct ingredient string used by any recipe
    df_ingr = set()
    for recipe_ingredients in df[col]:
        df_ingr.update(recipe_ingredients)

    # A list, not filter(): the result is consumed twice (set() below, then
    # the matching loop), which would silently yield nothing the second
    # time with Python 3's lazy filter object.
    long_ingredients = [name for name in ingr if ' ' in name]
    short_ingredients = ingr - set(long_ingredients)
    df_dic = {}

    # multi-word network ingredients: substring match
    for item in long_ingredients:
        for key in df_ingr:
            if item in key:
                df_dic.setdefault(key, []).append(item)

    # single-word network ingredients: whole-token match; split each recipe
    # ingredient once and look its tokens up in the set
    for key in df_ingr:
        for token in set(key.split()):
            if token in short_ingredients:
                df_dic.setdefault(key, []).append(token)

    diff_dic = df_ingr - set(df_dic)

    df_dic = tweak_dic(df_dic, diff_dic)
    diff_dic = df_ingr - set(df_dic)
    print('length of ingredients, matched ingredients, missed ingredients')
    print('%d %d %d' % (len(df_ingr), len(df_dic), len(diff_dic)))

    df2 = df.copy()
    df2['len_diff'] = df2[col].apply(lambda x: count_missing(x, df_dic))
    df2['match ingredients'] = df2[col].apply(lambda x: ingr_replace(x, df_dic))
    df2['len_match'] = df2['match ingredients'].apply(len)
    # keep recipes with fewer than 3 missing ingredients and at least one match
    df3 = df2[(df2['len_diff'] < 3) & (df2['len_match'] != 0)]
    print('dataframe shape before and after filtering')
    print('%s %s' % (df2.shape, df3.shape))

    # sorted ingredient set, for later matching to flavor
    match_ingr = set()
    for matched in df3['match ingredients']:
        match_ingr.update(matched)
    sorted_ingr = sorted(match_ingr)

    # one boolean indicator column per matched ingredient
    df4 = df3.copy()
    for item in sorted_ingr:
        df4[item] = df4['match ingredients'].apply(lambda x, name=item: name in x)

    # dropping the pre-existing columns leaves only the indicator columns
    df_X = df4.drop(df3.columns, axis=1)

    return df4, df_X

#after direct string matching, catch some spelling differences through this
def tweak_dic(df_dic, diff_df):
    '''Add hand-written fallback mappings for ingredients that were not matched.

    df_dic:  dict mapping recipe ingredient -> list of flavor-network
             ingredients; it is modified in place.
    diff_df: iterable of recipe ingredient strings without a match so far.

    For every rule below, any string in diff_df that contains one of the
    rule's substrings and is not yet a key of df_dic is mapped to a new
    list holding the rule's target ingredients. Rules are applied top to
    bottom, so an earlier rule wins when several apply to one string.

    Returns df_dic (the same object that was passed in).
    '''
    # (substrings to look for, flavor-network ingredients to map to)
    rules = (
        (('chile', 'chili', 'chilies', 'chilli', 'sriracha'), ('tabasco pepper',)),
        (('flour',), ('whole grain wheat flour',)),
        (('tumeric',), ('turmeric',)),
        (('yoghurt',), ('yogurt',)),
        (('sausage',), ('smoked sausage',)),
        (('rib', 'chuck', 'sirloin', 'steak'), ('beef',)),
        (('fillet',), ('raw fish',)),
        (('mozzarella',), ('mozzarella cheese',)),
        (('spinach',), ('dried spinach',)),
        (('curry',), ('coriander', 'turmeric', 'cumin', 'cayenne')),
    )

    for needles, targets in rules:
        for needle in needles:
            for name in diff_df:
                if needle in name and name not in df_dic:
                    # a fresh list per key, so entries never share state
                    df_dic[name] = list(targets)

    return df_dic

#Count missing ingredients after matching; salt, sugar, water and oil are not in the flavor network and thus don't count as missing
def count_missing(lst, df_dic):
    '''Count the ingredients of one recipe that have no flavor-network match.

    lst:    list of ingredient strings of a recipe.
    df_dic: dict whose keys are the recipe ingredients that were matched.

    An ingredient is not counted as missing when it is a key of df_dic or
    when one of its whitespace-separated words is 'salt', 'sugar', 'water'
    or 'oil'.

    Returns the number of remaining (missing) ingredients.
    '''
    staples = {'salt', 'sugar', 'water', 'oil'}
    missing = 0
    for name in lst:
        if name in df_dic:
            continue
        if staples.intersection(name.split()):
            continue
        missing += 1
    return missing

#After making dictionary to map ingredients from yummly recipes to ingredients in the flavor network, this is to replace ingredients in recipes with ingredients in the flavor network
def ingr_replace(lst, df_dic):
    '''Translate a recipe's ingredients into flavor-network ingredients.

    lst:    list of ingredient strings of a recipe.
    df_dic: dict mapping recipe ingredient -> iterable of flavor-network
            ingredients.

    Ingredients that are not keys of df_dic are ignored.
    Returns the set of all network ingredients mapped from lst.
    '''
    return {
        network_name
        for name in lst
        if name in df_dic
        for network_name in df_dic[name]
    }

#using flavor network to project recipes from ingredient matrix to flavor matrix
def flavor_profile(df,ingr,comp,ingr_comp):
    '''Project a recipe-by-ingredient matrix onto flavor compounds.

    df:        dataframe with one column per ingredient (names with spaces)
               and one row per recipe; its values are multiplied as numbers.
    ingr:      ingredient table indexed by ingredient id, with an
               'ingredient name' column (names with underscores).
    comp:      compound table indexed by compound id.
    ingr_comp: link table with columns '# ingredient id' and 'compound id'.

    Returns a numpy array of shape (number of recipes, number of compounds
    in the crosstab); entry [r, c] is the sum over the recipe's ingredients
    of the ingredient/compound link counts.
    '''
    # the flavor network spells ingredient names with underscores
    underscore_ingredients = [name.replace(' ', '_') for name in df.columns]

    print('%d %d' % (len(underscore_ingredients), len(df.columns)))

    ingr_total = ingr_comp.join(ingr, how='right', on='# ingredient id')
    ingr_total = ingr_total.join(comp, how='right', on='compound id')

    # ingredient x compound count matrix
    ingr_pivot = pd.crosstab(ingr_total['ingredient name'], ingr_total['compound id'])
    # Align rows explicitly with the columns of df. Merely selecting rows
    # with isin() keeps the crosstab's own row order and drops unknown
    # names, so the dot product below could pair a recipe column with the
    # wrong ingredient or fail on a shape mismatch. Ingredients absent
    # from the crosstab contribute an all-zero row.
    ingr_flavor = ingr_pivot.reindex(index=underscore_ingredients, fill_value=0)

    df_flavor = df.values.dot(ingr_flavor.values)
    print('%s %s' % (df.shape, df_flavor.shape))

    return df_flavor

#normalize flavor matrix with tfidf method
def _l2_normalize_rows(mat):
    '''Return mat with every row divided by its L2 norm; all-zero rows stay zero.'''
    norms = np.sqrt(np.sum(mat ** 2, axis=1)).reshape(mat.shape[0], 1)
    norms[norms == 0] = 1  # avoid dividing by zero for empty rows
    return mat / norms


def make_tfidf(arr):
    '''Normalize a flavor-count matrix with the tf-idf method.

    input: 2-D numpy array with flavor counts, one row per recipe and one
           column per compound; it is not modified.
    return: float numpy array of the same shape, adjusted as tfidf. Rows
            are L2-normalized (term frequency), multiplied column-wise by
            the smoothed idf  log((N + 1) / (1 + df)) + 1, where N is the
            number of rows and df the number of rows with a positive value
            in the column, and then L2-normalized again.

    The shape of the result is printed.
    '''
    counts = np.array(arr, dtype=float)  # float copy of the input
    n_rows = counts.shape[0]

    tf = _l2_normalize_rows(counts)

    doc_freq = np.sum(tf > 0, axis=0)
    idf = np.log(float(n_rows + 1) / (1.0 + doc_freq)) + 1.0

    # the same row normalization as above replaces
    # sklearn.preprocessing.normalize(norm='l2', axis=1), so no function-local
    # import is needed for it
    tfidf = _l2_normalize_rows(np.multiply(tf, idf))
    print(tfidf.shape)
    return tfidf


if __name__ == '__main__':
    # NOTE: unpickling can execute arbitrary code; only load pickle files
    # that come from a trusted source.
    yum = pd.read_pickle('data/yummly.pkl')
    #drop duplicates
    yum = yum.drop_duplicates(['id'], keep='first')
    #drop low ratings
    yum = yum[yum['rating']>2]
    #drop dishes such as dessert and sauce
    yum = yum[yum['course']!='[Desserts]']
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the filtered one
    yum = yum[yum['course']!='[Condiments and Sauces]'].copy()
    #clean up cuisine labels
    yum['cuisine']= yum['cuisine'].apply(lambda x: x.strip('[]'))

    # collapse multi-label cuisines to a single label
    cuisine_dic = {'Thai, Asian': 'Thai', 'Chinese, Asian':'Chinese', 'Japanese, Asian':'Japanese',
     'Southern & Soul Food, American': 'Southern & Soul Food',
     'Mediterranean, Greek': 'Mediterranean',
     'Cajun & Creole, Southern & Soul Food, American': 'Southern & Soul Food',
     'Asian, Japanese': 'Japanese','Cajun & Creole, American': 'Cajun & Creole',
     'Hawaiian, American': 'Hawaiian', 'Asian, Thai': 'Thai', 'American, Cuban':'Cuban',
     'Greek, Mediterranean': 'Greek', 'Indian, Asian': 'Indian','Asian, Chinese':'Chinese',
     'American, Kid-Friendly': 'American', 'Spanish, Portuguese':'Spanish',
     'Mexican, Southwestern': 'Mexican', 'Southwestern, Mexican': 'Southwestern',
     'American, Southern & Soul Food': 'Southern & Soul Food',
     'Cajun & Creole, Southern & Soul Food': 'Southern & Soul Food',
     'Portuguese, American':'American','American, French': 'American',
     'American, Cajun & Creole':'American',
     'American, Cajun & Creole, Southern & Soul Food': 'American',
     'Irish, American':'American'
        }

    # labels without an entry in cuisine_dic are kept unchanged
    yum['cuisine'] = yum['cuisine'].apply(lambda x: cuisine_dic.get(x, x))
    #remove cuisines with few dishes: keep only the 25 most frequent labels
    subcuisine = list(yum['cuisine'].value_counts().index[:25])
    yum = yum[yum['cuisine'].isin(subcuisine)].copy()
    #clean up ingredients and create list
    yum['clean ingredients'] = yum['ingredients'].apply(split_ingr)
    yum['clean ingredients'] = yum['clean ingredients'].apply(lambda x:[remove_word(word) for word in x])
    yum.to_pickle('data/yummly_clean.pkl')

    #make list and set for all ingredients
    yum_lst = list(itertools.chain.from_iterable(yum['clean ingredients']))
    yum_ingr = set(yum_lst)
    print('%d %d' % (len(yum_lst), len(yum_ingr)))

    #load ingr and comp information for the flavor network
    comp = pd.read_csv('data/comp_info.tsv',index_col=0,sep='\t')
    ingr_comp = pd.read_csv('data/ingr_comp.tsv',sep='\t')
    ingr = pd.read_csv('data/ingr_info.tsv',index_col=0,sep='\t')
    # network names use underscores; recipe ingredients use spaces
    ingr['space ingredients']= ingr['ingredient name'].apply(lambda x: x.replace('_',' ') )
    ingr_ingredients = set(ingr['space ingredients'])
    print(len(ingr_ingredients))
    #clean up ingredients and get two dataframes
    yum_ingr, yum_X = cleanup_ingredients(ingr_ingredients, yum, 'clean ingredients')
    #pickle the dataframe yum_ingr and yum_X
    yum_ingr.to_pickle('data/yummly_ingr.pkl')
    yum_X.to_pickle('data/yummly_ingrX.pkl')
    #get flavor profile
    yum_flavor = flavor_profile(yum_X, ingr, comp, ingr_comp)
    #make tfidf from flavor profile
    yum_tfidf = make_tfidf(yum_flavor)
    #pickle numpy array as dataframes
    pd.DataFrame(yum_flavor).to_pickle('data/yum_flavor.pkl')
    pd.DataFrame(yum_tfidf).to_pickle('data/yum_tfidf.pkl')