package org.jsoup.parser; import org.jsoup.Jsoup; import org.jsoup.TextUtil; import org.jsoup.helper.StringUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.nodes.XmlDeclaration; import org.junit.Ignore; import org.junit.Test; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URISyntaxException; import java.nio.charset.Charset; import java.util.List; import static org.jsoup.nodes.Document.OutputSettings.Syntax; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; /** * Tests XmlTreeBuilder. * * @author Jonathan Hedley */ public class XmlTreeBuilderTest { @Test public void testSimpleXmlParse() { String xml = "<doc id=2 href='/bar'>Foo <br /><link>One</link><link>Two</link></doc>"; XmlTreeBuilder tb = new XmlTreeBuilder(); Document doc = tb.parse(xml, "http://foo.com/"); assertEquals("<doc id=\"2\" href=\"/bar\">Foo <br /><link>One</link><link>Two</link></doc>", TextUtil.stripNewlines(doc.html())); assertEquals(doc.getElementById("2").absUrl("href"), "http://foo.com/bar"); } @Test public void testPopToClose() { // test: </val> closes Two, </bar> ignored String xml = "<doc><val>One<val>Two</val></bar>Three</doc>"; XmlTreeBuilder tb = new XmlTreeBuilder(); Document doc = tb.parse(xml, "http://foo.com/"); assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", TextUtil.stripNewlines(doc.html())); } @Test public void testCommentAndDocType() { String xml = "<!DOCTYPE HTML><!-- a comment -->One <qux />Two"; XmlTreeBuilder tb = new XmlTreeBuilder(); Document doc = tb.parse(xml, "http://foo.com/"); assertEquals("<!DOCTYPE HTML><!-- a comment -->One <qux />Two", TextUtil.stripNewlines(doc.html())); } @Test public void testSupplyParserToJsoupClass() { String xml = "<doc><val>One<val>Two</val></bar>Three</doc>"; Document doc = Jsoup.parse(xml, "http://foo.com/", Parser.xmlParser()); assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", TextUtil.stripNewlines(doc.html())); } @Ignore @Test public void testSupplyParserToConnection() throws IOException { String xmlUrl = "http://direct.infohound.net/tools/jsoup-xml-test.xml"; // parse with both xml and html parser, ensure different Document xmlDoc = Jsoup.connect(xmlUrl).parser(Parser.xmlParser()).get(); Document htmlDoc = Jsoup.connect(xmlUrl).parser(Parser.htmlParser()).get(); Document autoXmlDoc = Jsoup.connect(xmlUrl).get(); // check connection auto detects xml, uses xml parser assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", TextUtil.stripNewlines(xmlDoc.html())); assertFalse(htmlDoc.equals(xmlDoc)); assertEquals(xmlDoc, autoXmlDoc); assertEquals(1, htmlDoc.select("head").size()); // html parser normalises assertEquals(0, xmlDoc.select("head").size()); // xml parser does not assertEquals(0, autoXmlDoc.select("head").size()); // xml parser does not } @Test public void testSupplyParserToDataStream() throws IOException, URISyntaxException { File xmlFile = new File(XmlTreeBuilder.class.getResource("/htmltests/xml-test.xml").toURI()); InputStream inStream = new FileInputStream(xmlFile); Document doc = Jsoup.parse(inStream, null, "http://foo.com", Parser.xmlParser()); assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", TextUtil.stripNewlines(doc.html())); } @Test public void testDoesNotForceSelfClosingKnownTags() { // html will force "<br>one</br>" to logically "<br />One<br />". XML should be stay "<br>one</br> -- don't recognise tag. Document htmlDoc = Jsoup.parse("<br>one</br>"); assertEquals("<br>one\n<br>", htmlDoc.body().html()); Document xmlDoc = Jsoup.parse("<br>one</br>", "", Parser.xmlParser()); assertEquals("<br>one</br>", xmlDoc.html()); } @Test public void handlesXmlDeclarationAsDeclaration() { String html = "<?xml encoding='UTF-8' ?><body>One</body><!-- comment -->"; Document doc = Jsoup.parse(html, "", Parser.xmlParser()); assertEquals("<?xml encoding=\"UTF-8\"?> <body> One </body> <!-- comment -->", StringUtil.normaliseWhitespace(doc.outerHtml())); assertEquals("#declaration", doc.childNode(0).nodeName()); assertEquals("#comment", doc.childNode(2).nodeName()); } @Test public void xmlFragment() { String xml = "<one src='/foo/' />Two<three><four /></three>"; List<Node> nodes = Parser.parseXmlFragment(xml, "http://example.com/"); assertEquals(3, nodes.size()); assertEquals("http://example.com/foo/", nodes.get(0).absUrl("src")); assertEquals("one", nodes.get(0).nodeName()); assertEquals("Two", ((TextNode)nodes.get(1)).text()); } @Test public void xmlParseDefaultsToHtmlOutputSyntax() { Document doc = Jsoup.parse("x", "", Parser.xmlParser()); assertEquals(Syntax.xml, doc.outputSettings().syntax()); } @Test public void testDoesHandleEOFInTag() { String html = "<img src=asdf onerror=\"alert(1)\" x="; Document xmlDoc = Jsoup.parse(html, "", Parser.xmlParser()); assertEquals("<img src=\"asdf\" onerror=\"alert(1)\" x=\"\" />", xmlDoc.html()); } @Test public void testDetectCharsetEncodingDeclaration() throws IOException, URISyntaxException { File xmlFile = new File(XmlTreeBuilder.class.getResource("/htmltests/xml-charset.xml").toURI()); InputStream inStream = new FileInputStream(xmlFile); Document doc = Jsoup.parse(inStream, null, "http://example.com/", Parser.xmlParser()); assertEquals("ISO-8859-1", doc.charset().name()); assertEquals("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?> <data>äöåéü</data>", TextUtil.stripNewlines(doc.html())); } @Test public void testParseDeclarationAttributes() { String xml = "<?xml version='1' encoding='UTF-8' something='else'?><val>One</val>"; Document doc = Jsoup.parse(xml, "", Parser.xmlParser()); XmlDeclaration decl = (XmlDeclaration) doc.childNode(0); assertEquals("1", decl.attr("version")); assertEquals("UTF-8", decl.attr("encoding")); assertEquals("else", decl.attr("something")); assertEquals("version=\"1\" encoding=\"UTF-8\" something=\"else\"", decl.getWholeDeclaration()); assertEquals("<?xml version=\"1\" encoding=\"UTF-8\" something=\"else\"?>", decl.outerHtml()); } @Test public void caseSensitiveDeclaration() { String xml = "<?XML version='1' encoding='UTF-8' something='else'?>"; Document doc = Jsoup.parse(xml, "", Parser.xmlParser()); assertEquals("<?XML version=\"1\" encoding=\"UTF-8\" something=\"else\"?>", doc.outerHtml()); } @Test public void testCreatesValidProlog() { Document document = Document.createShell(""); document.outputSettings().syntax(Syntax.xml); document.charset(Charset.forName("utf-8")); assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<html>\n" + " <head></head>\n" + " <body></body>\n" + "</html>", document.outerHtml()); } @Test public void preservesCaseByDefault() { String xml = "<TEST ID=1>Check</TEST>"; Document doc = Jsoup.parse(xml, "", Parser.xmlParser()); assertEquals("<TEST ID=\"1\">Check</TEST>", TextUtil.stripNewlines(doc.html())); } @Test public void canNormalizeCase() { String xml = "<TEST ID=1>Check</TEST>"; Document doc = Jsoup.parse(xml, "", Parser.xmlParser().settings(ParseSettings.htmlDefault)); assertEquals("<test id=\"1\">Check</test>", TextUtil.stripNewlines(doc.html())); } }