org.jsoup.parser.Parser Java Examples

The following examples show how to use org.jsoup.parser.Parser. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: UrlConnectTest.java From astor with GNU General Public License v2.0

6 votes

/**
 * Fetches a URL that answers 200 with an empty body and checks that the response
 * can be parsed, first without an explicit parser and then with the XML parser.
 */
@Test
public void handles200WithNoContent() throws IOException {
    final String emptyBodyUrl = "http://direct.infohound.net/tools/200-no-content.pl";

    // First pass: no parser configured on the connection.
    Connection.Response plainResponse = Jsoup
        .connect(emptyBodyUrl)
        .userAgent(browserUa)
        .execute();
    plainResponse.parse(); // result is not inspected; the call only has to complete
    assertEquals(200, plainResponse.statusCode());

    // Second pass: same URL, XML parser requested explicitly.
    Connection.Response xmlResponse = Jsoup
        .connect(emptyBodyUrl)
        .parser(Parser.xmlParser())
        .userAgent(browserUa)
        .execute();
    xmlResponse.parse();
    assertEquals(200, xmlResponse.statusCode());
}

Example #2

Source File: JsoupBasedFormatter.java From formatter-maven-plugin with Apache License 2.0

6 votes

/**
 * Re-renders {@code code} through jsoup using the parser that matches the
 * configured syntax.
 *
 * @param code   source text to format
 * @param ending line ending argument; not used by this implementation
 * @return the formatted text, or {@code null} when formatting leaves {@code code} unchanged
 * @throws IllegalArgumentException if the configured syntax is neither html nor xml
 */
@Override
public String doFormat(String code, LineEnding ending) {
    // Pick the parser first so the parse call itself is written only once.
    final Parser parser;
    switch (formatter.syntax()) {
    case html:
        parser = Parser.htmlParser();
        break;
    case xml:
        parser = Parser.xmlParser();
        break;
    default:
        throw new IllegalArgumentException(formatter.syntax() + " is not allowed as syntax");
    }

    final Document parsed = Jsoup.parse(code, "", parser);
    parsed.outputSettings(formatter);

    final String rendered = parsed.outerHtml();
    // null signals "nothing changed" to the caller.
    return code.equals(rendered) ? null : rendered;
}

Example #3

Source File: DataUtilTest.java From astor with GNU General Public License v2.0

6 votes

/**
 * Parses a document whose meta charset ("iso-8") is not a usable charset name and
 * checks that parsing still succeeds and the meta element is rendered unchanged.
 */
@Test
public void wrongMetaCharsetFallback() {
    // Encode via the Charset overload: unlike getBytes("UTF-8") it declares no
    // UnsupportedEncodingException, so the test needs no try/catch that would
    // reduce a failure to just its message.
    final byte[] input = "<html><head><meta charset=iso-8></head><body></body></html>"
            .getBytes(java.nio.charset.StandardCharsets.UTF_8);
    final ByteBuffer inBuffer = ByteBuffer.wrap(input);

    // No charset is supplied (null), so the parser has to settle on one itself.
    Document doc = DataUtil.parseByteData(inBuffer, null, "http://example.com", Parser.htmlParser());

    final String expected = "<html>\n" +
                            " <head>\n" +
                            "  <meta charset=\"iso-8\">\n" +
                            " </head>\n" +
                            " <body></body>\n" +
                            "</html>";

    assertEquals(expected, doc.toString());
}

Example #4

Source File: Ch5Coz4.java From CrawlerPack with Apache License 2.0

6 votes

/**
 * Downloads JSON from {@code url}, converts it to XML, and prints the XML both
 * before and after it has been parsed by jsoup's XML parser.
 */
public static void normalXmlParse(){
    final String xml = CrawlerPack.jsonToXml(CrawlerPack.getFromRemote(url));

    // Result of converting the original JSON to XML
    System.out.println( "原始XML" ) ;
    System.out.println( xml );

    final Document parsed = Jsoup.parse(xml, "", Parser.xmlParser());
    parsed.charset(StandardCharsets.UTF_8);

    // What happened? Print the document after the jsoup round trip for comparison.
    final String rendered = parsed.toString();
    System.out.println( "轉換後XML" ) ;
    System.out.println(rendered);
}

Example #5

Source File: AppShellSettings.java From flow with Apache License 2.0

6 votes

/**
 * Produces the element for this entry: loads the content on first use, resolves
 * {@code AUTOMATIC} wrapping from the file extension, and then either wraps the
 * content in a style/script element or parses it with the XML parser.
 *
 * @param request request used to load the dependency contents when not yet loaded
 * @return the created or parsed element
 */
private Element element(VaadinRequest request) {
    // Content is fetched lazily and kept for subsequent calls.
    if (content == null) {
        content = BootstrapUtils.getDependencyContents(request, file);
    }

    // Resolve AUTOMATIC wrapping once, based on the file extension.
    if (type == Wrapping.AUTOMATIC && file != null) {
        final String lowerCaseFile = file.toLowerCase();
        if (lowerCaseFile.endsWith(".css")) {
            type = Wrapping.STYLESHEET;
        } else if (lowerCaseFile.endsWith(".js")) {
            type = Wrapping.JAVASCRIPT;
        }
    }

    if (type == Wrapping.STYLESHEET) {
        return createElement("style", content, "type", "text/css");
    } else if (type == Wrapping.JAVASCRIPT) {
        return createElement("script", content, "type",
                "text/javascript");
    } else {
        // Any other wrapping: hand the content to the XML parser as-is.
        return Jsoup.parse(content, "", Parser.xmlParser());
    }
}

Example #6

Source File: JerryExtractor.java From web-data-extractor with Apache License 2.0

6 votes

/**
 * Parses {@code str} with the XML parser and renders it according to
 * {@code outType}.
 *
 * @param str markup to parse
 * @return the document's inner HTML for {@code TYPE_HTML}; its text for
 *         {@code TYPE_TEXT} and for every other value
 */
private String parse(String str) {
    final Document parsed = Jsoup.parse(str, "", Parser.xmlParser());
    switch (outType) {
        case TYPE_HTML:
            return parsed.html();
        case TYPE_TEXT:
        default:
            // Text output is both the explicit TYPE_TEXT case and the fallback.
            return parsed.text();
    }
}

Example #7

Source File: SelectorTest.java From jsoup-learning with MIT License

6 votes

/**
 * Parses a fixed HTML sample with the HTML parser and prints the elements
 * matched by the selector {@code "body div"}.
 */
public static void main(String[] args) {
    final String sampleMarkup = "<body>\n" +
            " <textarea>\n" +
            "        &lt;!-- Text --&gt;\n" +
            "        xxx\n" +
            "    </textarea> \n" +
            " <div> \n" +
            "  <table> \n" +
            "   <!-- InTable --> \n" +
            "   <!-- InTableText --> xxx \n" +
            "   <tbody> \n" +
            "    <tr> \n" +
            "     <!-- InRow --> \n" +
            "     <td> \n" +
            "      <!-- InCell --> </td> \n" +
            "    </tr> \n" +
            "   </tbody> \n" +
            "  </table> \n" +
            " </div> \n" +
            "</body>";

    final Document parsed = Parser.htmlParser().parseInput(sampleMarkup, "");
    final Elements divsInBody = parsed.select("body div");
    System.out.println(divsInBody);
}

Example #8

Source File: NicoAudioSourceManager.java From lavaplayer with Apache License 2.0

6 votes

/**
 * Requests the thumb-info XML for a video and turns it into a track.
 *
 * @param videoId identifier appended to the thumb-info URL
 * @return the track extracted from the returned XML
 * @throws FriendlyException if an {@link IOException} occurs or the response status
 *         is not a success with content
 */
private AudioTrack loadTrack(String videoId) {
  checkLoggedIn();

  // Both resources live in one try; they are still closed in reverse order
  // (response first, then the interface).
  try (HttpInterface httpInterface = getHttpInterface();
       CloseableHttpResponse response = httpInterface.execute(
           new HttpGet("http://ext.nicovideo.jp/api/getthumbinfo/" + videoId))) {
    final int statusCode = response.getStatusLine().getStatusCode();
    if (!HttpClientTools.isSuccessWithContent(statusCode)) {
      throw new IOException("Unexpected response code from video info: " + statusCode);
    }

    final Document thumbInfo = Jsoup.parse(
        response.getEntity().getContent(), StandardCharsets.UTF_8.name(), "", Parser.xmlParser());
    return extractTrackFromXml(videoId, thumbInfo);
  } catch (IOException e) {
    throw new FriendlyException("Error occurred when extracting video info.", SUSPICIOUS, e);
  }
}

Example #9

Source File: BoxDotComAccount.java From neembuu-uploader with GNU General Public License v3.0

6 votes

/**
 * Exchanges the current {@code ticket} for an auth token and stores that token as
 * an encrypted property under {@code KEY_AUTH_TOKEN}.
 *
 * <p>The token is read from the {@code response auth_token} element of the XML
 * response. Other account details could be read from the same document here.
 *
 * @throws Exception if the HTTP request or reading the response fails
 */
private void getUserInfo() throws Exception {
    // NOTE(review): the API key is embedded in the request URL below; consider moving it to configuration.
    NULogger.getLogger().log(Level.INFO, "{0} Getting auth token value............", getClass());
    httpGet = new NUHttpGet("https://www.box.net/api/1.0/rest?action=get_auth_token&api_key=vkf3k5dh0tg1ibvcikjcp8sx0f89d14u&ticket=" + ticket);
    httpResponse = httpclient.execute(httpGet, httpContext);
    responseString = EntityUtils.toString(httpResponse.getEntity());

    doc = Jsoup.parse(responseString, "", Parser.xmlParser());
    String authToken = doc.select("response auth_token").text();

    // Do not write the token itself to the log: it is a credential and is stored
    // encrypted right below. Only record whether one was received.
    NULogger.getLogger().log(Level.INFO, "{0} Auth_token received : {1}", new Object[]{getClass(), !authToken.isEmpty()});
    properties().setEncryptedProperty(KEY_AUTH_TOKEN, authToken);
}

Example #10

Source File: Node.java From astor with GNU General Public License v2.0

5 votes

/**
 * Parses {@code html} as a fragment and adds the resulting nodes to this node's
 * parent at position {@code index}.
 *
 * @param index position in the parent's children at which to add the nodes
 * @param html  markup to parse; must not be null
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    // The fragment is parsed in the context of the parent when it is an Element.
    final Element context;
    if (parent() instanceof Element) {
        context = (Element) parent();
    } else {
        context = null;
    }

    final List<Node> parsed = Parser.parseFragment(html, context, baseUri());
    final Node[] siblings = parsed.toArray(new Node[parsed.size()]);
    parentNode.addChildren(index, siblings);
}

Example #11

Source File: DataUtil.java From jsoup-learning with MIT License

5 votes

/**
 * Loads a file to a Document, parsing it with the HTML parser.
 *
 * <p>The file stream is managed by try-with-resources, so it is always closed and a
 * failure while closing is recorded as a suppressed exception instead of replacing
 * an exception thrown while reading or parsing.
 *
 * @param in file to load
 * @param charsetName character set of input
 * @param baseUri base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 */
public static Document load(File in, String charsetName, String baseUri) throws IOException {
    try (FileInputStream inStream = new FileInputStream(in)) {
        ByteBuffer byteData = readToByteBuffer(inStream);
        return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
    }
}

Example #12

Source File: Element.java From astor with GNU General Public License v2.0

5 votes

/**
 * Parses the supplied HTML in the context of this element and adds the resulting
 * nodes at index 0 of this element's children, ahead of the existing ones.
 *
 * @param html HTML to add inside this element, before the existing HTML; must not be null
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    final List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    final Node[] leading = parsed.toArray(new Node[parsed.size()]);
    addChildren(0, leading);
    return this;
}

Example #13

Source File: Element.java From astor with GNU General Public License v2.0

5 votes

/**
 * Parses the supplied HTML in the context of this element and adds the resulting
 * nodes after this element's existing children.
 *
 * @param html HTML to add inside this element, after the existing HTML; must not be null
 * @return this element
 * @see #html(String)
 */
public Element append(String html) {
    Validate.notNull(html);

    final List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    final Node[] trailing = parsed.toArray(new Node[parsed.size()]);
    addChildren(trailing);
    return this;
}

Example #14

Source File: RssLoader.java From android-opensource-library-56 with Apache License 2.0

5 votes

/**
 * Fetches the feed URL with the XML parser and hands the document to
 * {@code parseCssSelector}.
 *
 * @return {@code mList}; it is returned even when fetching or parsing failed
 */
@Override
public RssList loadInBackground() {
    try {
        final Document feed = Jsoup
                .connect(mFeed.url)
                .parser(Parser.xmlParser())
                .get();
        parseCssSelector(feed);
        // Alternative traversal, currently disabled: parseDomTraverse(feed);
    } catch (Exception e) {
        // Best effort: report the failure and fall through to return mList.
        e.printStackTrace();
    }
    return mList;
}

Example #15

Source File: UrlConnectTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Fetches an XML resource with the HTML parser requested explicitly and checks that
 * the HTML tree builder was used and the output is an HTML document.
 */
@Test
public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException {
    // XML would normally be auto-detected and parsed as XML; an explicitly
    // requested HTML parser has to take precedence.
    final Connection connection = Jsoup
        .connect("http://direct.infohound.net/tools/parse-xml.xml")
        .parser(Parser.htmlParser());
    final Document fetched = connection.get();

    final boolean usedHtmlTreeBuilder =
        connection.request().parser().getTreeBuilder() instanceof HtmlTreeBuilder;
    assertTrue(usedHtmlTreeBuilder);

    final String rendered = StringUtil.normaliseWhitespace(fetched.outerHtml());
    assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", rendered);
}

Example #16

Source File: CharsetIdentification.java From storm-crawler with Apache License 2.0

5 votes

/**
 * Attempt to find a META tag in the HTML that hints at the character set
 * used to write the document.
 *
 * @param buffer    raw bytes of the document
 * @param maxlength when positive and smaller than the buffer, the number of leading
 *                  bytes to inspect; otherwise the whole buffer is used
 * @return the first charset hint found, or {@code null} when there is none or
 *         parsing fails
 */
private static String getCharsetFromMeta(byte buffer[], int maxlength) {
    // Decode with the default charset -- which hopefully will not mess up the
    // characters we're interested in...
    final int limit = (maxlength > 0 && maxlength < buffer.length)
            ? maxlength
            : buffer.length;
    final String html = new String(buffer, 0, limit, DEFAULT_CHARSET);

    try {
        final Document doc = Parser.htmlParser().parseInput(html, "dummy");

        // look for <meta http-equiv="Content-Type"
        // content="text/html;charset=gb2312"> or HTML5 <meta
        // charset="gb2312">
        for (Element meta : doc
                .select("meta[http-equiv=content-type], meta[charset]")) {
            String candidate = null;
            if (meta.hasAttr("http-equiv")) {
                candidate = getCharsetFromContentType(meta.attr("content"));
            }
            if (candidate == null && meta.hasAttr("charset")) {
                candidate = meta.attr("charset");
            }
            if (candidate != null) {
                return candidate;
            }
        }
    } catch (Exception e) {
        // Best effort: any failure while parsing means "no hint found".
        return null;
    }

    return null;
}

Example #17

Source File: UrlConnectTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Requests an XML URL through a connection configured with the HTML parser and
 * asserts that the HTML tree builder is in use and the result is rendered as HTML.
 */
@Test
public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException {
    // Without an explicit parser the XML would be auto-detected; here the HTML
    // parser is requested on purpose.
    final String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml";
    final Connection htmlConnection = Jsoup.connect(xmlUrl).parser(Parser.htmlParser());

    final Document result = htmlConnection.get();
    final Connection.Request sentRequest = htmlConnection.request();

    assertTrue(sentRequest.parser().getTreeBuilder() instanceof HtmlTreeBuilder);

    final String expectedHtml = "<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>";
    assertEquals(expectedHtml, StringUtil.normaliseWhitespace(result.outerHtml()));
}

Example #18

Source File: DataUtilTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Parses UTF-8 bytes that start with a byte order mark, without naming a charset,
 * and checks the head text and the output charset of the resulting document.
 */
@Test public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() {
    final String markupWithBom = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
    final ByteBuffer encoded = Charset.forName("UTF-8").encode(markupWithBom);

    // charsetName is null: no charset is supplied by the caller.
    final Document parsed = DataUtil.parseByteData(encoded, null, "http://foo.com/", Parser.htmlParser());

    assertEquals("One", parsed.head().text());
    assertEquals("UTF-8", parsed.outputSettings().charset().displayName());
}

Example #19

Source File: Cleaner.java From astor with GNU General Public License v2.0

5 votes

/**
 * Checks a body HTML fragment by parsing it with error tracking and copying its
 * nodes into a fresh document.
 *
 * @param bodyHtml HTML fragment to check
 * @return {@code true} only when no node was discarded by the copy and the parser
 *         recorded no errors
 */
public boolean isValidBodyHtml(String bodyHtml) {
    final Document clean = Document.createShell("");
    final Document dirty = Document.createShell("");
    final ParseErrorList errors = ParseErrorList.tracking(1);

    // Parse into the dirty shell's body, then copy what is allowed into the clean shell.
    final List<Node> parsed = Parser.parseFragment(bodyHtml, dirty.body(), "", errors);
    dirty.body().insertChildren(0, parsed);
    final int discarded = copySafeNodes(dirty.body(), clean.body());

    final boolean nothingDiscarded = discarded == 0;
    final boolean noParseErrors = errors.size() == 0;
    return nothingDiscarded && noParseErrors;
}

Example #20

Source File: HttpConnection.java From astor with GNU General Public License v2.0

5 votes

/**
 * Creates a request with its defaults: GET, redirects followed, a 30 second
 * timeout, a 1MB body limit, no data, gzip and user-agent headers, and the
 * HTML parser.
 */
Request() {
    // Transport behaviour
    method = Method.GET;
    followRedirects = true;
    timeoutMilliseconds = 30 * 1000; // 30 seconds
    maxBodySizeBytes = 1 << 20; // 1MB (1024 * 1024)

    // Request payload starts out empty
    data = new ArrayList<>();

    // Default headers, added in the same order as before
    addHeader("Accept-Encoding", "gzip");
    addHeader(USER_AGENT, DEFAULT_UA);

    parser = Parser.htmlParser();
}

Example #21

Source File: TextExtractorTest.java From storm-crawler with Apache License 2.0

5 votes

/**
 * Configures the extractor to exclude "style" and checks that the content of an
 * upper-case {@code <STYLE>} element is left out of the extracted text.
 */
@Test
public void testExclusionCase() throws IOException {
    final Config conf = new Config();
    conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "style");
    final TextExtractor extractor = new TextExtractor(conf);

    // The tag is upper case while the exclusion is configured in lower case.
    final Document parsed = Parser.htmlParser().parseInput(
            "<html>the<STYLE>main</STYLE>content of the page</html>",
            "http://stormcrawler.net");

    final String extracted = extractor.text(parsed.body());
    assertEquals("the content of the page", extracted);
}

Example #22

Source File: Element.java From astor with GNU General Public License v2.0

5 votes

/**
 * Adds inner HTML at the start of this element. The HTML is parsed as a fragment
 * with this element as context, and the nodes are added at child index 0.
 *
 * @param html HTML to add inside this element, before the existing HTML; must not be null
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    final List<Node> fragment = Parser.parseFragment(html, this, baseUri());
    final Node[] fragmentNodes = new Node[fragment.size()];
    addChildren(0, fragment.toArray(fragmentNodes));
    return this;
}

Example #23

Source File: Element.java From astor with GNU General Public License v2.0

5 votes

/**
 * Inserts parsed HTML ahead of this element's current children. The supplied HTML
 * is parsed with this element as the fragment context and this element's base URI.
 *
 * @param html HTML to add inside this element, before the existing HTML; must not be null
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    final String base = baseUri();
    final List<Node> newChildren = Parser.parseFragment(html, this, base);
    // Index 0 places the new nodes before everything already in this element.
    addChildren(0, newChildren.toArray(new Node[newChildren.size()]));
    return this;
}

Example #24

Source File: Node.java From astor with GNU General Public License v2.0

5 votes

/**
 * Parses {@code html} and adds the parsed nodes to the parent node at {@code index}.
 *
 * @param index child index in the parent at which the nodes are added
 * @param html  markup to parse; must not be null
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    // Use the parent as fragment context only when it is an Element.
    Element context = null;
    if (parent() instanceof Element) {
        context = (Element) parent();
    }

    final List<Node> fragment = Parser.parseFragment(html, context, baseUri());
    final Node[] fragmentNodes = new Node[fragment.size()];
    parentNode.addChildren(index, fragment.toArray(fragmentNodes));
}

Example #25

Source File: DataUtilTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Parses a stream whose meta charset ("iso-8") is not a usable charset name and
 * checks that the document is still produced with the meta element unchanged.
 */
@Test
public void wrongMetaCharsetFallback() throws IOException {
    // No charset is passed in (null).
    final Document parsed = DataUtil.parseInputStream(
        stream("<html><head><meta charset=iso-8></head><body></body></html>"),
        null,
        "http://example.com",
        Parser.htmlParser());

    final String expected = "<html>\n" +
        " <head>\n" +
        "  <meta charset=\"iso-8\">\n" +
        " </head>\n" +
        " <body></body>\n" +
        "</html>";

    final String actual = parsed.toString();
    assertEquals(expected, actual);
}

Example #26

Source File: DataUtilTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * The first Content-Type meta element carries no charset parameter and the second
 * one names euc-kr; the body text must decode correctly from euc-kr bytes.
 */
@Test
public void secondMetaElementWithContentTypeContainsCharsetParameter() throws Exception {
    final String markup = "<html><head>" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html\">" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=euc-kr\">" +
            "</head><body>한국어</body></html>";

    // The stream is encoded as euc-kr; no charset is passed to the parser (null).
    final Document parsed = DataUtil.parseInputStream(
            stream(markup, "euc-kr"), null, "http://example.com", Parser.htmlParser());

    final String bodyText = parsed.body().text();
    assertEquals("한국어", bodyText);
}

Example #27

Source File: DataUtilTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Two Content-Type meta elements name different charsets (iso-8859-1, then koi8-u);
 * the body text must decode correctly from iso-8859-1 bytes.
 */
@Test
public void firstMetaElementWithCharsetShouldBeUsedForDecoding() throws Exception {
    final String markup = "<html><head>" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=koi8-u\">" +
            "</head><body>Übergrößenträger</body></html>";

    // The stream is encoded as iso-8859-1; no charset is passed to the parser (null).
    final Document parsed = DataUtil.parseInputStream(
            stream(markup, "iso-8859-1"), null, "http://example.com", Parser.htmlParser());

    final String bodyText = parsed.body().text();
    assertEquals("Übergrößenträger", bodyText);
}

Example #28

Source File: Cleaner.java From astor with GNU General Public License v2.0

5 votes

/**
 * Tests whether a body HTML fragment passes this cleaner: the fragment is parsed
 * with error tracking and its nodes are copied into an empty document.
 *
 * @param bodyHtml HTML fragment to test
 * @return {@code false} if the copy discarded any node or the parser recorded an
 *         error; {@code true} otherwise
 */
public boolean isValidBodyHtml(String bodyHtml) {
    final Document clean = Document.createShell("");
    final Document dirty = Document.createShell("");
    final ParseErrorList errorList = ParseErrorList.tracking(1);

    final List<Node> fragment = Parser.parseFragment(bodyHtml, dirty.body(), "", errorList);
    dirty.body().insertChildren(0, fragment);

    if (copySafeNodes(dirty.body(), clean.body()) != 0) {
        // At least one node was dropped by the copy.
        return false;
    }
    return errorList.size() == 0;
}

Example #29

Source File: HttpConnection.java From astor with GNU General Public License v2.0

5 votes

/**
 * Initialises a request with default settings: a 30 second timeout, a 1MB maximum
 * body size, redirect following enabled, an empty data list, the GET method,
 * "Accept-Encoding: gzip" and user-agent headers, and the HTML parser.
 */
Request() {
    // Limits
    maxBodySizeBytes = 1024 * 1024; // 1MB
    timeoutMilliseconds = 30 * 1000; // 30 seconds

    // Behaviour
    followRedirects = true;
    method = Method.GET;
    data = new ArrayList<>();

    // Headers are added in their original order, before the parser is assigned.
    addHeader("Accept-Encoding", "gzip");
    addHeader(USER_AGENT, DEFAULT_UA);
    parser = Parser.htmlParser();
}

Example #30

Source File: Node.java From jsoup-learning with MIT License

5 votes

/**
 * Parses the given HTML as a fragment and adds the nodes to {@code parentNode}
 * at the given index.
 *
 * @param index index among the parent's children at which the nodes are added
 * @param html  markup to parse; must not be null
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    // Only an Element parent can serve as the fragment's parsing context.
    final boolean parentIsElement = parent() instanceof Element;
    final Element context = parentIsElement ? (Element) parent() : null;

    final List<Node> parsedNodes = Parser.parseFragment(html, context, baseUri());
    final Node[] toInsert = parsedNodes.toArray(new Node[parsedNodes.size()]);
    parentNode.addChildren(index, toInsert);
}