from nltk import ngrams


def ngram_toks(sents, n=1):
    """Collect the word-level n-grams of every sentence into one flat list.

    Each sentence is split on whitespace with ``str.split()`` and handed to
    ``nltk.ngrams``. The n-grams are gathered in sentence order and
    duplicates are kept, so the result can be used for frequency or
    distinctness statistics.

    Args:
        sents: Iterable of sentence strings.
        n: Size of the n-grams to extract (defaults to 1).

    Returns:
        A list holding every n-gram yielded by ``ngrams`` for all sentences.
    """
    ntoks = []
    for sent in sents:
        # extend() consumes the ngrams iterator directly, so no intermediate
        # list copies are needed.
        ntoks.extend(ngrams(sent.split(), n))
    return ntoks



def ndistinct(sents, n=1):
    """Return the fraction of n-grams in ``sents`` that are distinct.

    The value is ``len(set(ngrams)) / len(ngrams)`` computed over all
    n-grams returned by ``ngram_toks(sents, n)``.

    Args:
        sents: Iterable of sentence strings.
        n: Size of the n-grams to consider (defaults to 1).

    Returns:
        A float ratio of unique n-grams to total n-grams, or ``0.0`` when
        no n-grams were produced at all.
    """
    total_tokens = ngram_toks(sents, n)
    if not total_tokens:
        # Nothing to measure (e.g. no sentences were given); report 0.0
        # rather than failing with ZeroDivisionError.
        return 0.0

    # float() keeps this a true division regardless of the division
    # semantics in effect.
    return float(len(set(total_tokens))) / len(total_tokens)

def avg_nd(all_sents, n=1):
    """Average the ``ndistinct`` score over several groups of sentences.

    Args:
        all_sents: Sized collection of sentence groups; each group is passed
            to ``ndistinct`` unchanged.
        n: Size of the n-grams to consider (defaults to 1).

    Returns:
        The mean of ``ndistinct(group, n)`` over all groups, or ``0`` when
        ``all_sents`` is empty.
    """
    group_count = len(all_sents)
    mean_ratio = 0
    for group in all_sents:
        # Each group's score is scaled by the group count before it is added,
        # so the running total is already the mean once the loop ends.
        mean_ratio = mean_ratio + ndistinct(group, n) / group_count
    return mean_ratio


if __name__ == '__main__':
    # Demo: a handful of near-duplicate paraphrases, scored for n-gram sizes
    # 1 through 7 with ndistinct().
    samples = [
        'what is best way to make money online',
        'what should i do to make money online',
        'what should i do to earn money online',
        'what is the easiest way to make money online',
        'what is the easiest way to earn money online',
        'what s the easiest way to make money online',
        'what s the easiest way to earn money online',
        'what should i do to make money online online',
        'what is the best way to make money online',
        'what is the easiest way to make money online online',
    ]

    report_line = 'Number of {}-distinct ngrams: {}'
    for order in range(1, 8):
        print(report_line.format(order, ndistinct(samples, order)))