org.htmlcleaner.CleanerProperties Java Examples

The following examples show how to use org.htmlcleaner.CleanerProperties. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HtmlSpanner.java    From SDHtmlTextView with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the {@link HtmlCleaner} instance and applies this class's fixed
 * set of cleaner options to it.
 *
 * <p>The options are set on the properties object owned by the cleaner
 * itself, so the returned cleaner is ready to use.
 *
 * @return a newly created, fully configured cleaner
 */
private static HtmlCleaner createHtmlCleaner() {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();

    // Escaping behaviour.
    props.setAdvancedXmlEscape(true);

    // Document prologue: drop the XML declaration, keep the doctype.
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);

    // Entity and character handling.
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setRecognizeUnicodeChars(true);

    // Markup quirks.
    props.setIgnoreQuestAndExclam(true);
    props.setUseEmptyElementTags(false);

    // Tags listed here are pruned from the cleaned tree.
    props.setPruneTags("script,title");

    return cleaner;
}
 
Example #2
Source File: UtilsStaticAnalyzer.java    From apogen with Apache License 2.0 5 votes vote down vote up
/**
 * Cleans the given markup, evaluates the XPath against it and, when the first
 * match is a tag, derives a string from that tag via
 * {@code digTheTagTreeForAString}.
 *
 * @param xp  XPath addressing the element of interest; it is lower-cased and
 *            its {@code html[1]/} prefix is rewritten before evaluation
 * @param dom HTML markup to clean and search
 * @return the string derived from the first matching tag, or an empty string
 *         when nothing matches, the first match is not a tag, or the XPath
 *         cannot be evaluated
 * @throws UnsupportedEncodingException declared for callers; presumably raised
 *         by {@code digTheTagTreeForAString} - confirm against that helper
 */
private static String digForAMeaningfulName(String xp, String dom) throws UnsupportedEncodingException {

	// Lower-case with a fixed locale: the default-locale variant can mangle
	// ASCII letters (e.g. 'I' under a Turkish locale) and break the XPath.
	xp = xp.toLowerCase(java.util.Locale.ROOT);

	HtmlCleaner cleaner = new HtmlCleaner();
	CleanerProperties props = cleaner.getProperties();
	props.setAllowHtmlInsideAttributes(true);
	props.setAllowMultiWordAttributes(true);
	props.setRecognizeUnicodeChars(true);
	props.setOmitComments(true);
	props.setOmitDoctypeDeclaration(true);

	TagNode node = cleaner.clean(dom);

	// workaround: htmlcleaner works with rel xpaths
	xp = xp.replace("html[1]/", "/");
	try {
		Object[] result = node.evaluateXPath(xp);

		// Only a tag can be dug into; an XPath selecting something else
		// (such as an attribute or text value) is treated as "no match".
		if (result.length > 0 && result[0] instanceof TagNode) {
			return digTheTagTreeForAString((TagNode) result[0]);
		}

	} catch (XPatherException e) {
		// Best effort: report the problem and fall through to the default.
		e.printStackTrace();
	}

	// couldn't find a representative string :(
	return "";
}
 
Example #3
Source File: HTMLCleanerHandle.java    From java-client-api with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the serializer used to write out cleaned documents.
 *
 * <p>The serializer is built from this handle's configuration; when that is
 * {@code null}, the properties of the parser are used instead.
 *
 * @return a new {@link CompactXmlSerializer}
 */
protected XmlSerializer makeSerializer() {
  CleanerProperties props = getConfiguration();
  if (props == null) {
    // No explicit configuration: fall back to the parser's own properties.
    props = getParser().getProperties();
  }
  return new CompactXmlSerializer(props);
}
 
Example #4
Source File: XmlUtils.java    From iaf with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an HTML string to XHTML.
 *
 * <p>The input is trimmed and passed through
 * {@code XmlUtils.skipDocTypeDeclaration}. Only when the remaining text starts
 * with {@code <html>} or {@code <html } is it run through HtmlCleaner and
 * serialized as XML; otherwise that remaining text is returned as is.
 *
 * @param htmlString the HTML to convert
 * @return the converted text, or {@code null} when
 *         {@code StringUtils.isNotEmpty(htmlString)} is false
 */
public static String toXhtml(String htmlString) {
	if (!StringUtils.isNotEmpty(htmlString)) {
		return null;
	}

	String stripped = XmlUtils.skipDocTypeDeclaration(htmlString.trim());
	boolean hasHtmlRoot = stripped.startsWith("<html>") || stripped.startsWith("<html ");
	if (!hasHtmlRoot) {
		return stripped;
	}

	// The same properties object configures both the cleaner and the serializer.
	CleanerProperties cleanerProps = new CleanerProperties();
	TagNode root = new HtmlCleaner(cleanerProps).clean(stripped);
	return new SimpleXmlSerializer(cleanerProps).getXmlAsString(root);
}
 
Example #5
Source File: MagnetWServiceModelImp.java    From AndroidMagnetSearch with Apache License 2.0 4 votes vote down vote up
/**
 * Downloads a search result page and extracts one {@code MagnetInfo} per
 * result row using the supplied XPath expressions.
 *
 * <p>Rows whose text content is blank are skipped. Rows for which the magnet
 * or name expression selects nothing are skipped as well, so a single
 * malformed row no longer aborts the whole page with a
 * {@code NullPointerException}. Size, count and hot values are optional.
 *
 * @param rootUrl base URL handed to {@code transformDetailUrl}
 * @param url     URL template handed to {@code transformUrl}
 * @param keyword search keyword
 * @param sort    sort option
 * @param page    page number
 * @param group   XPath selecting the result rows in the document
 * @param magnet  XPath, relative to a row, selecting the magnet node
 * @param name    XPath, relative to a row, selecting the name node; its
 *                {@code href} attribute, when present, gives the detail link
 * @param size    XPath, relative to a row, selecting the size node
 * @param count   XPath, relative to a row, selecting the count node
 * @param hot     XPath, relative to a row, selecting the hot node
 * @return the extracted entries, in document order; never {@code null}
 * @throws IOException              if the page cannot be fetched
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 * @throws ParserConfigurationException if the DOM cannot be created
 * @throws XPatherException         declared for callers
 */
public List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    String newUrl = transformUrl(url, keyword,sort, page);
    String html = Jsoup.connect(newUrl).get().body().html();

    // Clean the markup with HtmlCleaner, then convert it to a W3C DOM so the
    // standard XPath API can be used on it.
    XPath xPath = XPathFactory.newInstance().newXPath();
    TagNode tagNode = new HtmlCleaner().clean(html);
    Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

    NodeList result = (NodeList) xPath.evaluate(group, dom, XPathConstants.NODESET);
    List<MagnetInfo> infos = new ArrayList<MagnetInfo>();
    for (int i = 0; i < result.getLength(); i++) {
        Node node = result.item(i);
        if (node == null) {
            continue;
        }
        if (StringUtil.isEmpty(node.getTextContent().trim())) {
            continue;
        }

        // Magnet link and name are mandatory: evaluate() yields null when an
        // expression selects nothing, and such a row cannot be used.
        Node magnetNote = (Node) xPath.evaluate(magnet, node, XPathConstants.NODE);
        Node nameNote = (Node) xPath.evaluate(name, node, XPathConstants.NODE);
        if (magnetNote == null || nameNote == null) {
            continue;
        }

        MagnetInfo info = new MagnetInfo();
        // Magnet link
        String magnetValue = magnetNote.getTextContent();
        info.setMagnet(transformMagnet(magnetValue));
        // Name
        String nameValue = nameNote.getTextContent();
        info.setName(nameValue);
        // Detail link: only set when the name node actually carries an href.
        Node hrefAttr = nameNote.getAttributes() == null
                ? null
                : nameNote.getAttributes().getNamedItem("href");
        if (hrefAttr != null) {
            info.setDetailUrl(transformDetailUrl(rootUrl, hrefAttr.getTextContent()));
        }
        // Size
        Node sizeNote = ((Node) xPath.evaluate(size, node, XPathConstants.NODE));
        if (sizeNote != null) {
            String sizeValue = sizeNote.getTextContent();
            info.setFormatSize(sizeValue);

            info.setSize(transformSize(sizeValue));
        }
        // Time (the original comment says "time"; the value is stored as the count)
        Node dateNote=((Node) xPath.evaluate(count, node, XPathConstants.NODE));
        if(dateNote!=null){
            String countValue = dateNote.getTextContent();
            info.setCount(countValue);
        }
        Node hotNote=((Node) xPath.evaluate(hot, node, XPathConstants.NODE));
        if(hotNote!=null){
            String hotValue = hotNote.getTextContent();
            info.setHot(hotValue);
        }
        // Extra information derived from the name
        String resolution = transformResolution(nameValue);
        info.setResolution(resolution);

        infos.add(info);
    }
    return infos;
}
 
Example #6
Source File: MagnetWServiceModelImp.java    From AndroidDownload with Apache License 2.0 4 votes vote down vote up
/**
 * Fetches a search result page and turns each result row into a
 * {@code MagnetInfo}, locating the individual values with the XPath
 * expressions passed in.
 *
 * <p>A row is skipped when its text content is blank, or when the magnet or
 * name expression selects nothing in it; previously the latter case raised a
 * {@code NullPointerException} and discarded the whole page. The size, count
 * and hot values are filled in only when their expressions match.
 *
 * @param rootUrl base URL handed to {@code transformDetailUrl}
 * @param url     URL template handed to {@code transformUrl}
 * @param keyword search keyword
 * @param sort    sort option
 * @param page    page number
 * @param group   XPath selecting the result rows in the document
 * @param magnet  XPath, relative to a row, selecting the magnet node
 * @param name    XPath, relative to a row, selecting the name node; its
 *                {@code href} attribute, when present, gives the detail link
 * @param size    XPath, relative to a row, selecting the size node
 * @param count   XPath, relative to a row, selecting the count node
 * @param hot     XPath, relative to a row, selecting the hot node
 * @return the extracted entries, in document order; never {@code null}
 * @throws IOException              if the page cannot be fetched
 * @throws XPathExpressionException if an XPath expression cannot be evaluated
 * @throws ParserConfigurationException if the DOM cannot be created
 * @throws XPatherException         declared for callers
 */
public List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    String newUrl = transformUrl(url, keyword,sort, page);
    String html = Jsoup.connect(newUrl).get().body().html();

    // HtmlCleaner tidies the markup; the result is converted to a W3C DOM so
    // that javax.xml.xpath can query it.
    XPath xPath = XPathFactory.newInstance().newXPath();
    TagNode tagNode = new HtmlCleaner().clean(html);
    Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

    NodeList result = (NodeList) xPath.evaluate(group, dom, XPathConstants.NODESET);
    List<MagnetInfo> infos = new ArrayList<MagnetInfo>();
    for (int i = 0; i < result.getLength(); i++) {
        Node node = result.item(i);
        if (node == null) {
            continue;
        }
        if (StringUtil.isEmpty(node.getTextContent().trim())) {
            continue;
        }

        // Required values first: a NODE evaluation returns null when nothing
        // is selected, and a row without a magnet link or name is unusable.
        Node magnetNote = (Node) xPath.evaluate(magnet, node, XPathConstants.NODE);
        Node nameNote = (Node) xPath.evaluate(name, node, XPathConstants.NODE);
        if (magnetNote == null || nameNote == null) {
            continue;
        }

        MagnetInfo info = new MagnetInfo();
        // Magnet link
        String magnetValue = magnetNote.getTextContent();
        info.setMagnet(transformMagnet(magnetValue));
        // Name
        String nameValue = nameNote.getTextContent();
        info.setName(nameValue);
        // Detail link: left unset when the name node has no href attribute.
        Node hrefAttr = nameNote.getAttributes() == null
                ? null
                : nameNote.getAttributes().getNamedItem("href");
        if (hrefAttr != null) {
            info.setDetailUrl(transformDetailUrl(rootUrl, hrefAttr.getTextContent()));
        }
        // Size
        Node sizeNote = ((Node) xPath.evaluate(size, node, XPathConstants.NODE));
        if (sizeNote != null) {
            String sizeValue = sizeNote.getTextContent();
            info.setFormatSize(sizeValue);

            info.setSize(transformSize(sizeValue));
        }
        // Time (the original comment says "time"; the value is stored as the count)
        Node dateNote=((Node) xPath.evaluate(count, node, XPathConstants.NODE));
        if(dateNote!=null){
            String countValue = dateNote.getTextContent();
            info.setCount(countValue);
        }
        Node hotNote=((Node) xPath.evaluate(hot, node, XPathConstants.NODE));
        if(hotNote!=null){
            String hotValue = hotNote.getTextContent();
            info.setHot(hotValue);
        }
        // Extra information derived from the name
        String resolution = transformResolution(nameValue);
        info.setResolution(resolution);

        infos.add(info);
    }
    return infos;
}
 
Example #7
Source File: HTMLCleanerHandle.java    From java-client-api with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the cleaner configuration of this handle, creating it through
 * {@code makeConfiguration()} the first time it is requested while unset.
 *
 * @return the stored configuration, or the one just created
 */
public CleanerProperties getConfiguration() {
  if (configuration != null) {
    return configuration;
  }
  // Not set yet: create one and keep it for later calls.
  configuration = makeConfiguration();
  return configuration;
}
 
Example #8
Source File: HTMLCleanerHandle.java    From java-client-api with Apache License 2.0 4 votes vote down vote up
/**
 * Replaces the cleaner configuration held by this handle.
 *
 * @param configuration the configuration to store; stored as given, without
 *                      copying or validation
 */
public void setConfiguration(CleanerProperties configuration) {
  this.configuration = configuration;
}
 
Example #9
Source File: HTMLCleanerHandle.java    From java-client-api with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a new cleaner configuration.
 *
 * <p>This implementation returns a freshly constructed
 * {@link CleanerProperties} with no further settings applied.
 *
 * @return a new {@link CleanerProperties} instance
 */
protected CleanerProperties makeConfiguration() {
  CleanerProperties defaults = new CleanerProperties();
  return defaults;
}