Java Code Examples for org.jsoup.nodes.Document#outputSettings()

The following examples show how to use org.jsoup.nodes.Document#outputSettings(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
/**
 * Parses the given stream with {@code Jsoup.parse(in, charsetName, baseUri)}
 * and disables pretty-printing on the resulting document.
 *
 * <p>The charset name and base URI are looked up through
 * {@code Docx4jProperties}; the base URI falls back to the empty string.
 *
 * @param input the HTML stream to parse
 * @return the parsed document with pretty-printing switched off
 * @throws IOException declared by the signature; presumably raised by the parse call
 */
@Override
public Document handle( InputStream input) throws IOException{
	// Look up the Jsoup parse parameters and convert the HTML into a Document
	Document parsed = Jsoup.parse(
			input,
			Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME ),
			Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,""));

	// Emit markup as-is instead of re-indenting it
	parsed.outputSettings(new OutputSettings().prettyPrint(false));

	return parsed;
}
 
Example 2
/**
 * Builds an {@link AdditionalInfo} from XML supplied as a string.
 *
 * <p>The text comes from the first element matching {@code item description}
 * (CRLF turned into {@code <br>}, trimmed, one trailing {@code <br>} removed);
 * the title is {@code TITLE} followed by the text of the first {@code pubDate}
 * element. When the text starts with one of the two "no notices" sentences,
 * the info is flagged as carrying no information.
 *
 * @param xml the XML document to read
 * @return the populated info object
 */
@NotNull static AdditionalInfo handleXML(String xml) {
	AdditionalInfo result = new AdditionalInfo();
	result.setTitle(TITLE);

	// Parse as XML and keep the markup unformatted
	Document feed = Jsoup.parse(xml, "", Parser.xmlParser());
	feed.outputSettings(new Document.OutputSettings().prettyPrint(false));

	String descriptionHtml = feed.select("item description").first().html();
	String text = descriptionHtml.replace("\r\n", "<br>").trim();

	boolean nothingReported =
			text.startsWith("Zurzeit gibt es keine Hinweise auf witterungsbedingten Unterrichtsausfall.")
			|| text.startsWith("Aktuell gibt es keine Hinweise auf witterungsbedingten Unterrichtsausfall.");
	if (nothingReported) {
		result.setHasInformation(false);
		// NOTE(review): this value is replaced by the setText call at the end of the method — confirm intent
		result.setText("Aktuell gibt es keine Hinweise auf witterungsbedingten Unterrichtsausfall.");
	}

	// Strip a single trailing line-break tag
	if (text.endsWith("<br>")) {
		text = text.substring(0, text.length() - "<br>".length());
	}

	String publishedAt = feed.select("pubDate").first().text();
	result.setTitle(TITLE + " (Stand: " + publishedAt + ")");
	result.setText(text);

	return result;
}
 
Example 3
/**
 * Parses {@code code} with the parser matching {@code formatter.syntax()},
 * applies {@code formatter} as the document's output settings and serializes
 * the document again.
 *
 * @param code   the source text to format
 * @param ending not used by this implementation
 * @return the serialized document, or {@code null} when it equals {@code code}
 * @throws IllegalArgumentException if the syntax is neither {@code html} nor {@code xml}
 */
@Override
public String doFormat(String code, LineEnding ending) {
    // Choose the parser first so the parse call itself is written once
    final Parser parser;
    switch (formatter.syntax()) {
    case html:
        parser = Parser.htmlParser();
        break;
    case xml:
        parser = Parser.xmlParser();
        break;
    default:
        throw new IllegalArgumentException(formatter.syntax() + " is not allowed as syntax");
    }

    Document parsed = Jsoup.parse(code, "", parser);
    parsed.outputSettings(formatter);

    String rendered = parsed.outerHtml();
    // null tells the caller that nothing changed
    return code.equals(rendered) ? null : rendered;
}
 
Example 4
/**
 * Fetches {@code url} with jsoup and prints document properties, the current
 * output settings and each child of {@code <head>} to standard output, then
 * hands the children of {@code <body>} to {@code printElements}.
 *
 * @param url the address to fetch
 * @throws Exception declared by the signature; presumably from the fetch or the helper
 */
private static void testHtmlParser(String url) throws Exception {
    Document page = Jsoup.connect(url)
            .userAgent(USER_AGENT)
            .cookie("auth", "token")
            .timeout(30000)
            .get();

    // Document-level properties
    System.out.println("charset = " + page.charset());
    System.out.println("location = " + page.location());
    System.out.println("nodeName = " + page.nodeName());

    // Output settings currently attached to the document
    Document.OutputSettings settings = page.outputSettings();
    System.out.println("charset = " + settings.charset());
    System.out.println("indentAmount = " + settings.indentAmount());
    System.out.println("syntax = " + settings.syntax());
    System.out.println("escapeMode = " + settings.escapeMode());
    System.out.println("prettyPrint = " + settings.prettyPrint());
    System.out.println("outline = " + settings.outline());

    System.out.println("title = " + page.title());
    System.out.println("baseUri = " + page.baseUri());

    // One line per head child: "<tag name> : <element>"
    for (Element headChild : page.head().children()) {
        System.out.print(headChild.tag().getName() + " : ");
        System.out.println(headChild);
    }
    printElements(page.body().children());
}
 
Example 5
Source Project: ripme   File: FuraffinityRipper.java    License: MIT License 5 votes vote down vote up
/**
 * Fetches an image page and returns its title and description as text.
 *
 * <p>The description is taken from the first {@code td} with
 * {@code class=alt1} and {@code width="70%"}. Line breaks are preserved by
 * marking {@code <br>} and {@code <p>} with a literal backslash-n placeholder
 * before all markup is stripped, then swapping the placeholder for the
 * platform line separator. The cookies of the response are merged into
 * {@code cookies}.
 *
 * @param page URL of the image page
 * @return the {@code og:title} content, a newline, then the description text;
 *         {@code null} if the page could not be fetched or has no description cell
 */
public String getDescription(String page) {
    try {
        // Fetch the image page
        Response resp = Http.url(page)
                .referrer(this.url)
                .response();
        cookies.putAll(resp.cookies());

        // Parse the body exactly once and reuse the document: a second
        // resp.parse() repeats the work and is only supported by jsoup when
        // the response has been buffered beforehand.
        Document documentz = resp.parse();

        // Try to find the description
        Elements els = documentz.select("td[class=alt1][width=\"70%\"]");
        if (els.isEmpty()) {
            LOGGER.debug("No description at " + page);
            throw new IOException("No description found");
        }
        LOGGER.debug("Description found!");
        Element ele = els.get(0); // This is where the description is.
        // Would break completely if FurAffinity changed site layout.
        documentz.outputSettings(new Document.OutputSettings().prettyPrint(false));
        // Insert a literal backslash-n placeholder that survives the tag stripping below
        ele.select("br").append("\\n");
        ele.select("p").prepend("\\n\\n");
        LOGGER.debug("Returning description at " + page);
        // Literal replace: neither the placeholder nor the separator is treated as a regex
        String withLineBreaks = ele.html().replace("\\n", System.getProperty("line.separator"));
        String tempPage = Jsoup.clean(withLineBreaks, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
        return documentz.select("meta[property=og:title]").attr("content") + "\n" + tempPage; // Overridden saveText takes first line and makes it the file name.
    } catch (IOException ioe) {
        LOGGER.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'");
        return null;
    }
}
 
Example 6
Source Project: Shadbot   File: NetUtils.java    License: GNU General Public License v3.0 5 votes vote down vote up
/**
 * Converts HTML to plain text while keeping line breaks: every {@code <br>}
 * yields one newline and every {@code <p>} is preceded by two.
 *
 * @param html The HTML to convert to text with new lines preserved, may be {@code null}.
 * @return The provided HTML converted to text with new lines preserved; a {@code null}
 *         or blank input is returned unchanged.
 */
@Nullable
public static String cleanWithLinebreaks(@Nullable String html) {
    if (html == null || html.isBlank()) {
        return html;
    }
    final Document document = Jsoup.parse(html);
    // Makes html() preserve linebreak and spacing
    document.outputSettings(new Document.OutputSettings().prettyPrint(false));
    // Mark the break positions with a literal backslash-n placeholder
    document.select("br").append("\\n");
    document.select("p").prepend("\\n\\n");
    // String.replace matches literally (no regex), so the target must be the
    // placeholder itself: one backslash followed by 'n'.
    final String str = document.html().replace("\\n", "\n");
    return Jsoup.clean(str, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
}
 
Example 7
Source Project: scava   File: HtmlParser.java    License: Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Cleans {@code input} against the whitelist, parses the cleaned markup and
 * passes the body's child nodes to {@code readNodes} together with a new list.
 *
 * @param input the HTML to process
 * @param wl    the whitelist used for cleaning
 * @return the list handed to {@code readNodes} (presumably filled with the extracted text)
 */
private static List<String> parse(String input, Whitelist wl)
{
	// Clean first, then parse the cleaned markup
	Document sanitized = Jsoup.parse(Jsoup.clean(input, "", wl, outputSettings));
	sanitized.outputSettings(outputSettings);

	List<String> collected = new ArrayList<>();
	readNodes(sanitized.body().childNodes(), collected);
	return collected;
}
 
Example 8
Source Project: scava   File: HtmlParser.java    License: Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Cleans {@code input} against the whitelist, parses the cleaned markup and
 * passes the body's child nodes to {@code readNodesWithTags} together with a
 * new list and the label {@code "body"}.
 *
 * @param input the HTML to process
 * @param wl    the whitelist used for cleaning
 * @return the list handed to {@code readNodesWithTags} (presumably tag/text pairs)
 */
private static List<Map.Entry<String,String>> parseWithTags(String input, Whitelist wl)
{
	// Clean first, then parse the cleaned markup
	Document sanitized = Jsoup.parse(Jsoup.clean(input, "", wl, outputSettings));
	sanitized.outputSettings(outputSettings);

	List<Map.Entry<String,String>> collected = new ArrayList<>();
	readNodesWithTags(sanitized.body().childNodes(), collected, "body");
	return collected;
}
 
Example 9
/**
 * Strips all markup from the given HTML while keeping line breaks, then
 * turns the {@code &gt;} and {@code &lt;} entities back into {@code >} and
 * {@code <}.
 *
 * @param html
 *            the HTML to convert, may be {@code null}
 * @return the converted text, or {@code null} for a {@code null} input
 */
public static String replaceHtmlLineBreaks(String html) {
	if (html == null) {
		return null;
	}

	Document parsed = Jsoup.parse(html);
	// Without pretty-printing, html() keeps line breaks and spacing
	parsed.outputSettings(new Document.OutputSettings().prettyPrint(false));

	// Mark break positions with a literal backslash-n placeholder
	parsed.select("br").append("\\n");
	parsed.select("p").prepend("\\n\\n");

	// Swap the placeholder for a real newline, then drop every tag
	String withNewlines = parsed.html().replace("\\n", "\n");
	String stripped = Jsoup.clean(withNewlines, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));

	return stripped.replace("&gt;", ">").replace("&lt;", "<");
}
 
Example 10
Source Project: voj   File: HtmlTextFilter.java    License: GNU General Public License v3.0 5 votes vote down vote up
/**
 * Filters a string that contains HTML: all markup is removed while line
 * breaks are kept.
 *
 * @param text - the string to filter, may be {@code null}
 * @return the filtered string, or {@code null} for a {@code null} input
 */
public static String filter(String text) {
	if ( text == null ) {
		return null;
	}

	Document parsed = Jsoup.parse(text);
	// Keep the original line breaks and spacing when serializing
	parsed.outputSettings(new Document.OutputSettings().prettyPrint(false));

	// Mark break positions with a literal backslash-n placeholder
	parsed.select("br").append("\\n");
	parsed.select("p").prepend("\\n\\n");

	// Swap the placeholder for a real newline before stripping every tag
	String withNewlines = parsed.html().replace("\\n", "\n");
	return Jsoup.clean(withNewlines, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
}
 
Example 11
Source Project: baleen   File: AbstractHtmlConsumer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds an HTML document for the given CAS and writes it to the file
 * returned by {@code getFileName}.
 *
 * <p>The head receives an optional stylesheet link (when {@code css} is set),
 * a {@code utf-8} charset declaration, and one {@code appendMeta} call per
 * document-annotation property and per {@code Metadata} annotation. A
 * metadata entry keyed {@code documentTitle} (case-insensitive) supplies the
 * document title. The body is produced by {@code writeBody}.
 *
 * @param jCas the CAS to export
 * @throws AnalysisEngineProcessException if the file cannot be written
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
  final File f = getFileName(jCas);
  final DocumentAnnotation da = getDocumentAnnotation(jCas);

  // NOTE(review): the language value is concatenated into markup unescaped — confirm it is trusted
  final Document doc =
      Jsoup.parse("<!DOCTYPE html>\n<html lang=\"" + da.getLanguage() + "\"></html>");
  doc.outputSettings(new Document.OutputSettings().prettyPrint(false));
  final Element head = doc.head();

  if (!Strings.isNullOrEmpty(css)) {
    final Element cssLink = head.appendElement("link");
    cssLink.attr("rel", "stylesheet");
    cssLink.attr("href", css);
  }

  final Element charset = head.appendElement("meta");
  charset.attr("charset", "utf-8");

  appendMeta(head, "document.type", da.getDocType());
  appendMeta(head, "document.sourceUri", da.getSourceUri());
  appendMeta(head, "externalId", da.getHash());

  appendMeta(head, "document.classification", da.getDocumentClassification());
  appendMeta(
      head,
      "document.caveats",
      String.join(",", UimaTypesUtils.toArray(da.getDocumentCaveats())));
  appendMeta(
      head,
      "document.releasability",
      String.join(",", UimaTypesUtils.toArray(da.getDocumentReleasability())));

  // Copy every metadata annotation into the head; the last documentTitle entry wins
  String title = null;
  for (final Metadata md : JCasUtil.select(jCas, Metadata.class)) {
    appendMeta(head, md.getKey(), md.getValue());
    if ("documentTitle".equalsIgnoreCase(md.getKey())) {
      title = md.getValue();
    }
  }

  if (!Strings.isNullOrEmpty(title)) {
    doc.title(title);
  }

  final Element body = doc.body();

  writeBody(jCas, body);

  try {
    // The head declares charset "utf-8", so the bytes on disk must be UTF-8
    // regardless of the platform default charset.
    FileUtils.writeStringToFile(f, doc.html(), java.nio.charset.StandardCharsets.UTF_8);
  } catch (final IOException e) {
    throw new AnalysisEngineProcessException(e);
  }
}