import jieba, codecs, sys, pandas import numpy as np from wordcloud import WordCloud from scipy.misc import imread from wordcloud import WordCloud, ImageColorGenerator from os import listdir from os.path import isfile, join stopwords_filename = 'data/stopwords.txt' font_filename = 'fonts/STFangSong.ttf' template_dir = 'data/templates/' def main(input_filename): content = '\n'.join([line.strip() for line in codecs.open(input_filename, 'r', 'utf-8') if len(line.strip()) > 0]) stopwords = set([line.strip() for line in codecs.open(stopwords_filename, 'r', 'utf-8')]) segs = jieba.cut(content) words = [] for seg in segs: word = seg.strip().lower() if len(word) > 1 and word not in stopwords: words.append(word) words_df = pandas.DataFrame({'word':words}) words_stat = words_df.groupby(by=['word'])['word'].agg({'number' : np.size}) words_stat = words_stat.reset_index().sort_values(by="number",ascending=False) print '# of different words =', len(words_stat) input_prefix = input_filename if input_filename.find('.') != -1: input_prefix = '.'.join(input_filename.split('.')[:-1]) for file in listdir(template_dir): if file[-4:] != '.png' and file[-4:] != '.jpg': continue background_picture_filename = join(template_dir, file) if isfile(background_picture_filename): prefix = file.split('.')[0] bimg=imread(background_picture_filename) wordcloud=WordCloud(font_path=font_filename,background_color='white',mask = bimg,max_font_size=600,random_state=100) wordcloud=wordcloud.fit_words(dict(words_stat.head(4000).itertuples(index=False))) bimgColors=ImageColorGenerator(bimg) wordcloud.recolor(color_func=bimgColors) output_filename = prefix + '_' + input_prefix + '.png' print 'Saving', output_filename wordcloud.to_file(output_filename) if __name__ == '__main__': if len(sys.argv) == 2: main(sys.argv[1]) else: print '[usage] <input>'