package com.github.xsocket.job.parser; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.util.Calendar; import javax.mail.BodyPart; import javax.mail.Multipart; import javax.mail.Session; import javax.mail.internet.MimeMessage; import org.apache.commons.io.FileUtils; import org.apache.http.client.fluent.Request; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.github.xsocket.job.AbstractResumeParser; import com.github.xsocket.job.Resume; import com.github.xsocket.job.ResumeParser; import com.github.xsocket.job.resume.ZhilianResume; import com.github.xsocket.job.util.QuotedPrintableUtils; /** * 智联邮件简历解析工具。 * @author MWQ * */ public class ZhilianEmailResumeParser extends AbstractResumeParser implements ResumeParser { private static final String COMPANY_SPLIT = " "; private static final String BASIC_INFO_SPLIT = "\\|"; @Override public String getName() { return "智联招聘(邮件)"; } @Override public boolean canParse(File file) { if(file == null) { return false; } else { String name = file.getName(); return (name.contains("Zhaopin.com") || name.startsWith("智联招聘")) && name.endsWith(".eml"); } } @Override public Resume parse(File file) throws Exception { ZhilianResume resume = new ZhilianResume(); String fileName = file.getName(); if(fileName.contains("Zhaopin.com")) { resume.setJob(intercept(fileName, "应聘 ", "-")); } else { // 解析已经解析完成的文件 int start = fileName.lastIndexOf("-") + 1; int end = fileName.lastIndexOf(".eml"); if(start > 0 && end > start) { resume.setJob(fileName.substring(start, end)); } } Document doc = parse2Html(file); /* 解析样例如下的数据: 董文朋 男|4年工作经验|1992年5月|未婚 现居住于北京-通州区|本科|湖北-孝感户口 ..... 其他信息 2014/02 - 2015/05 辽河油田泰利达有限公司 (1年3个月) 。。。。。。 其他信息 2008/09 - 2012/07 北京国际经贸研修学院 信息管理与信息系统 统招 本科 */ Elements sections = doc.select("table table table table tr"); // 下一行要解析的数据类型 String nextSection = "姓名"; for(Element section : sections) { String text = section.text(); if(isNullOrEmpty(text)) { continue; } if("姓名".equals(nextSection)) { resume.setName(text.trim()); nextSection = "综合"; } else if("综合".equals(nextSection)) { parseBasicInfo(resume, text); nextSection = "上家公司"; } else if("上家公司".equals(nextSection)) { // 判断是否是真是的公司信息 if(text.length() < 20) { continue; } else if(text.charAt(4) != '/') { continue; } else if(text.charAt(8) != '-') { continue; } String[] infos = text.split(COMPANY_SPLIT); if(infos.length>=4) { resume.setCompany(infos[3]); } nextSection = "毕业院校"; } else if("毕业院校".equals(nextSection)) { for(String edu : EDUCATIONS.keySet()) { if(text.contains(edu)) { resume.setEducation(edu); String[] infos = text.split(COMPANY_SPLIT); if(infos.length>=4) { resume.setSchool(infos[3]); } nextSection = "结束"; break; } } if("结束".equals(nextSection)) { break; } } } tryFetchContact(resume, doc); return resume; } protected void tryFetchContact(ZhilianResume resume, Document doc) { final String SPLIT1 = "url="; final String SPLIT2 = "ldparam="; Elements as = doc.select("table table table table tr td a"); for(Element elem : as) { String href = elem.attr("href"); if(href.contains(SPLIT2) && href.contains(SPLIT1)) { String url = href.substring(href.lastIndexOf(SPLIT1) + SPLIT1.length(), href.length()); String content; try { content = Request.Get(url).execute().returnContent().asString(); Document doc2 = Jsoup.parse(content); Elements infos = doc2.select("div.login_content p a"); resume.setName(infos.get(0).text()); resume.setPhone(infos.get(1).text()); resume.setMail(infos.get(2).text()); } catch (Exception e) { e.printStackTrace(System.err); } return; } } } protected void parseBasicInfo(ZhilianResume resume, String text) { String[] values = text.split(BASIC_INFO_SPLIT); String next = null; for(String value : values) { String str = value.trim(); if(str.contains("男")) { resume.setSex("男"); next = "工作经验"; } else if(str.contains("女")) { resume.setSex("女"); next = "工作经验"; } else if(str.contains("工作经验")) { int index = str.indexOf("年"); if(index > 0) { resume.setWorkDuration(str.substring(0, index)); } else { resume.setWorkDuration(str.substring(0, str.indexOf("工作经验"))); } next = "生日"; } else if("生日".equals(next)){ int index = str.indexOf("年"); if(index > 0) { resume.setBirthday(str); try { int birthYear = Integer.parseInt(str.substring(0, index)); Integer age = Calendar.getInstance().get(Calendar.YEAR) - birthYear; resume.setAge(age.toString()); } catch(Exception e) { // 解析出生年份失败,无法计算年龄 } } next = "城市"; } else if("城市".equals(next)){ int index = str.indexOf("现居住于"); if(index > 0) { resume.setCity(str.substring(index + 1, str.length())); } next = "学历"; } else if("学历".equals(next)) { if(EDUCATIONS.containsKey(str)) { resume.setEducation(str); } } } } protected Document parse2Html(File file) throws Exception { InputStream in = new FileInputStream(file); Session mailSession = Session.getDefaultInstance(System.getProperties(), null); MimeMessage msg = new MimeMessage(mailSession, in); Multipart part = (Multipart) msg.getContent(); String html = ""; for(int i = 0; i < part.getCount(); i++) { BodyPart body = part.getBodyPart(i); String type = body.getContentType(); if(type.startsWith("text/html")) { html = body.getContent().toString(); break; } } in.close(); if(html == null || html.length() == 0) { String content = FileUtils.readFileToString(file); final String endFlag = "</html>"; int start = content.indexOf("<html"); int end = content.indexOf(endFlag); content = content.substring(start, end + endFlag.length()); html = QuotedPrintableUtils.decode(content.getBytes(), "gb2312"); System.err.println(html); } return Jsoup.parse(html); } }