// Java source code of HtmlUtils

/*
 * Power by www.xiaoi.com
 */
package com.eastrobot.doc.util;

import java.io.File;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.mozilla.universalchardet.UniversalDetector;

/**
 * Static helpers for working with HTML text and for detecting the character
 * encoding of a file.
 *
 * @author <a href="mailto:[email protected]">eko.zhan</a>
 * @date 2017-08-09 20:30:29
 * @version 1.0
 */
public class HtmlUtils {
	
	public static final String UTF8 = "UTF-8";
	
	// TODO: has the header produced by Excel and PPT conversion been considered?
	// NOTE(review): "<!DOCTYPE HTML/>" is not a well-formed doctype declaration. It is
	// left untouched here because changing it may alter how browsers render pages
	// built from this template -- confirm before fixing.
	public static final String HEAD_TEMPLATE = 
		"<!DOCTYPE HTML/>" +
		"<HTML><HEAD>" + 
			"<META HTTP-EQUIV=\"CONTENT-TYPE\" CONTENT=\"text/html; charset=utf-8\">" +
			"<STYLE TYPE=\"text/css\">" +
				"@page { margin-left: 2cm; margin-right: 2cm; margin-top: 1.5cm; margin-bottom: 1.2cm }" +
				"P { text-indent: 0.35cm; margin-bottom: 0.21cm; direction: ltr; color: #000000; line-height: 150%; text-align: justify; widows: 2; orphans: 2 }" +
				"P.western { font-family: \"Calibri\", sans-serif; font-size: 10pt; so-language: en-US }" +
				"P.cjk { font-family: \"华文细黑\", \"微软雅黑\"; font-size: 10pt; so-language: zh-CN }" +
				"P.ctl { font-family: \"Times New Roman\", serif; font-size: 11pt; so-language: ar-SA }" +
				"A:link { color: #0000ff; text-decoration: none }" +
			"</STYLE>" +
		"</HEAD><BODY LANG=\"zh-CN\" TEXT=\"#000000\" LINK=\"#0000ff\" DIR=\"LTR\">";
	
	public static final String FOOT_TEMPLATE = "</BODY></HTML>";
	
	/** Case-insensitive match for a gb2312 content-type declaration; compiled once and reused. */
	private static final Pattern GB2312_CONTENT_TYPE =
		Pattern.compile("(?i)CONTENT=\"text/html; charset=gb2312\"");
	
	/** Replacement text written in place of every {@link #GB2312_CONTENT_TYPE} match. */
	private static final String UTF8_CONTENT_TYPE = "CONTENT=\"text/html; charset=utf-8\"";
	
	/**
	 * Rewrites one attribute of every occurrence of a given tag.
	 * <p>
	 * Adapted from http://blog.csdn.net/huweijun_2012/article/details/51900814
	 * <p>
	 * For each {@code <tag ...} found (case-insensitively), the first
	 * {@code tagAttrib="value"} inside it is replaced by
	 * {@code startTag + value + endTag}. Only double-quoted, non-empty values are
	 * recognised. Every matched tag is re-emitted as {@code "<" + tag + " "}
	 * followed by its (possibly rewritten) attributes, so the tag name takes the
	 * casing of the {@code tag} argument even when the attribute is absent.
	 * <p>
	 * The attribute value, the other attributes of the tag, {@code startTag} and
	 * {@code endTag} are all inserted literally: a dollar sign or backslash in
	 * any of them is copied to the output unchanged instead of being interpreted
	 * as a regex group reference or escape.
	 *
	 * @param str       text to process; {@code null} or empty input is returned unchanged
	 * @param tag       tag name; inserted into a regular expression without escaping
	 * @param tagAttrib attribute name whose value is rewritten; inserted into a
	 *                  regular expression without escaping
	 * @param startTag  literal text placed before the original attribute value
	 * @param endTag    literal text placed after the original attribute value
	 * @return the processed text
	 * @author huweijun
	 * @date 2016-07-13 19:15:32
	 */
	public static String replaceHtmlTag(String str, String tag, String tagAttrib, String startTag, String endTag) {
		if (str == null || str.isEmpty()) {
			return str;
		}
		// The closing '>' is deliberately not part of the match; appendTail copies it through.
		Pattern patternForTag = Pattern.compile("<\\s*" + tag + "\\s+([^>]*)\\s*", Pattern.CASE_INSENSITIVE);
		Pattern patternForAttrib = Pattern.compile(tagAttrib + "=\\s*\"([^\"]+)\"", Pattern.CASE_INSENSITIVE);
		Matcher matcherForTag = patternForTag.matcher(str);
		// StringBuffer is kept because Matcher.appendReplacement only accepts a
		// StringBuilder from Java 9 onwards.
		StringBuffer sb = new StringBuffer(str.length());
		while (matcherForTag.find()) {
			StringBuffer rewrittenTag = new StringBuffer("<" + tag + " ");
			Matcher matcherForAttrib = patternForAttrib.matcher(matcherForTag.group(1));
			if (matcherForAttrib.find()) {
				String attributeStr = matcherForAttrib.group(1);
				// Quote so that '$' and backslashes in the value are not treated as group references.
				matcherForAttrib.appendReplacement(rewrittenTag,
						Matcher.quoteReplacement(startTag + attributeStr + endTag));
			}
			matcherForAttrib.appendTail(rewrittenTag);
			// The rebuilt tag is plain text as well, so it must be quoted for the same reason.
			matcherForTag.appendReplacement(sb, Matcher.quoteReplacement(rewrittenTag.toString()));
		}
		matcherForTag.appendTail(sb);
		return sb.toString();
	}
	
	/**
	 * Changes gb2312 content-type declarations in {@code data} to utf-8.
	 * <p>
	 * Every case-insensitive occurrence of
	 * {@code CONTENT="text/html; charset=gb2312"} is replaced by
	 * {@code CONTENT="text/html; charset=utf-8"}. Only this exact spelling
	 * (a single space after the semicolon, charset gb2312) is matched.
	 *
	 * @author eko.zhan at 2017-08-11 09:54:34
	 * @param data HTML text; may be {@code null}
	 * @return the text with the declaration rewritten, or {@code null} if {@code data} is {@code null}
	 */
	public static String replaceCharset(String data){
		if (data == null) {
			return null;
		}
		return GB2312_CONTENT_TYPE.matcher(data).replaceAll(UTF8_CONTENT_TYPE);
	}
	
	/**
	 * Detects the character encoding of a file.
	 * <p>
	 * The whole file is read into memory and handed to a
	 * {@link UniversalDetector} in a single call.
	 *
	 * @author eko.zhan at Jul 3, 2017 1:54:50 PM
	 * @param file file to inspect
	 * @return the charset name reported by the detector; presumably {@code null}
	 *         when the detector cannot determine one (see the juniversalchardet
	 *         documentation) -- callers should handle that case
	 * @throws IOException if the file cannot be read
	 */
	public static String getFileEncoding(File file) throws IOException{
		UniversalDetector detector = new UniversalDetector(null);
		byte[] bytes = FileUtils.readFileToByteArray(file);
		detector.handleData(bytes, 0, bytes.length);
		// dataEnd() tells the detector that no more input follows, before the result is read.
		detector.dataEnd();
		return detector.getDetectedCharset();
	}
}