import email
import os
import html2text

# Directory that holds the .eml corpus. The script switches into it so that
# the relative os.walk(".") further down scans this tree.
CORPUS_DIR = 'C:/Users/Ernest/Desktop/corpus'
os.chdir(CORPUS_DIR)

# Shared HTML-to-text converter used by text_parser for HTML bodies.
# NOTE(review): the very large body_width presumably keeps html2text from
# hard-wrapping long lines -- confirm against the html2text documentation.
h2t = html2text.HTML2Text()
h2t.body_width = 20000

def _decode_payload(part):
    """Return the transfer-decoded payload of *part* as text.

    UTF-8 is tried first and GBK is used as the fallback. If neither codec
    can decode the bytes, the UnicodeDecodeError from the GBK attempt
    propagates to the caller.
    """
    raw = part.get_payload(decode=True)
    try:
        return str(raw, encoding='utf-8')
    except UnicodeDecodeError:
        return str(raw, encoding='gbk')


def text_parser(path):
    """Extract the readable body text of the e-mail stored at *path*.

    The message parts are walked in order and the first part whose content
    subtype is ``plain`` or ``html`` decides the result:

    * ``plain``: the NetEase tablet signature is removed, everything from
      the first ``----------------`` separator onwards is dropped, and the
      result is stripped.
    * ``html``: the markup is converted with the module-level ``h2t``
      converter, stripped, and every run of two spaces is removed.

    Args:
        path: Path of the ``.eml`` file to read.

    Returns:
        The extracted text, or ``None`` when the message has no ``plain``
        or ``html`` part.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a body is neither valid UTF-8 nor valid GBK.
    """
    # Parse from bytes: reading the file in text mode would decode raw 8-bit
    # bodies with the platform's default encoding before the charset
    # handling below gets a chance to run.
    with open(path, 'rb') as eml:
        message = email.message_from_binary_file(eml)

    # The former guard `m.get_content_type != 'mixed'` compared a bound
    # method with a string and was therefore always true; every message was
    # (and still is) walked.
    for part in message.walk():
        subtype = part.get_content_subtype()
        if subtype == 'plain':
            text = _decode_payload(part)
            text = text.replace('--\n发自我的网易邮箱平板适配版', '')
            text = text.split('----------------')[0]
            return text.strip()
        if subtype == 'html':
            text = h2t.handle(_decode_payload(part))
            return text.strip().replace('  ', '')
    return None

def extract_editor_section(text):
    """Return *text* up to and including its first editor credit line.

    Narrow no-break spaces (U+202F) are removed, every newline that is
    followed by a space is joined away, and empty lines are dropped. The
    remaining lines are then cut after the first one containing
    ``Editor:`` or ``Editors:``.

    Args:
        text: Body text of one message.

    Returns:
        The kept lines joined with newlines, or ``None`` when no line
        contains an editor credit.
    """
    cleaned = text.replace('\u202f', '').replace('\n ', '')
    lines = [line for line in cleaned.splitlines() if line]
    for idx, line in enumerate(lines):
        if 'Editor:' in line or 'Editors:' in line:
            # Stop at the first credit line. Continuing to scan after the
            # text had been truncated is what used to corrupt the output
            # when a message held more than one such line.
            return '\n'.join(lines[:idx + 1])
    return None


# Convert every .eml file below the current directory into a .txt file that
# sits next to it and holds the text up to the editor credit line. Messages
# without such a line produce no output file.
for root, _dirs, files in os.walk("."):
    total = len(files)
    for position, file_name in enumerate(files, start=1):
        if file_name.endswith('.eml'):
            path = os.path.join(root, file_name)
            try:
                text = text_parser(path)
                if text is None:
                    raise ValueError('no plain or html part found')
                section = extract_editor_section(text)
                if section is not None:
                    # Strip only the trailing suffix; str.replace would also
                    # rewrite '.eml' inside directory names.
                    txt_path = path[:-len('.eml')] + '.txt'
                    # Explicit UTF-8 so that characters outside the
                    # platform's default code page cannot abort the write.
                    with open(txt_path, 'w', encoding='utf-8') as txt:
                        txt.write(section)
            except Exception as exc:
                # Report the failing file together with the cause and carry
                # on with the remaining files instead of abandoning the
                # rest of this directory.
                print('fuck:', path, repr(exc))
        # Progress is reported per directory, counted over all of its files.
        print('\r', '已完成:{:.2f}%'.format(round(position * 100 / total)), end='', flush=True)