org.jsoup.parser.Parser Java Examples

The following examples show how to use org.jsoup.parser.Parser. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: UrlConnectTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Executes and parses the "200-no-content" test URL twice, first without an explicit
 * parser and then with the XML parser, asserting a 200 status each time.
 */
@Test
public void handles200WithNoContent() throws IOException {
    final String noContentUrl = "http://direct.infohound.net/tools/200-no-content.pl";

    // First pass: no parser configured on the connection.
    Connection.Response plainResponse = Jsoup.connect(noContentUrl)
        .userAgent(browserUa)
        .execute();
    plainResponse.parse(); // result unused; the call itself must complete without throwing
    assertEquals(200, plainResponse.statusCode());

    // Second pass: XML parser requested explicitly.
    Connection.Response xmlResponse = Jsoup.connect(noContentUrl)
        .parser(Parser.xmlParser())
        .userAgent(browserUa)
        .execute();
    xmlResponse.parse();
    assertEquals(200, xmlResponse.statusCode());
}
 
Example #2
Source File: JsoupBasedFormatter.java    From formatter-maven-plugin with Apache License 2.0 6 votes vote down vote up
/**
 * Parses {@code code} with the parser that matches the configured syntax and serializes
 * it again, using {@code formatter} as the document's output settings.
 *
 * @param code   text to format
 * @param ending requested line ending; not used by this implementation
 * @return the re-serialized text, or {@code null} when it is equal to {@code code}
 * @throws IllegalArgumentException if the configured syntax is neither html nor xml
 */
@Override
public String doFormat(String code, LineEnding ending) {
    final Parser parser;
    switch (formatter.syntax()) {
    case html:
        parser = Parser.htmlParser();
        break;
    case xml:
        parser = Parser.xmlParser();
        break;
    default:
        throw new IllegalArgumentException(formatter.syntax() + " is not allowed as syntax");
    }

    final Document document = Jsoup.parse(code, "", parser);
    document.outputSettings(formatter);

    final String formattedCode = document.outerHtml();
    // null tells the caller that formatting produced no change.
    return code.equals(formattedCode) ? null : formattedCode;
}
 
Example #3
Source File: DataUtilTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Parses a document whose {@code <meta charset>} names an unusable charset ("iso-8")
 * with no charset supplied by the caller, and checks the serialized result.
 */
@Test
public void wrongMetaCharsetFallback() {
    // Using the Charset constant instead of the "UTF-8" name removes the checked
    // UnsupportedEncodingException, which could never be thrown for UTF-8 anyway.
    final byte[] input = "<html><head><meta charset=iso-8></head><body></body></html>"
            .getBytes(java.nio.charset.StandardCharsets.UTF_8);
    final ByteBuffer inBuffer = ByteBuffer.wrap(input);

    Document doc = DataUtil.parseByteData(inBuffer, null, "http://example.com", Parser.htmlParser());

    final String expected = "<html>\n" +
                            " <head>\n" +
                            "  <meta charset=\"iso-8\">\n" +
                            " </head>\n" +
                            " <body></body>\n" +
                            "</html>";

    assertEquals(expected, doc.toString());
}
 
Example #4
Source File: Ch5Coz4.java    From CrawlerPack with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches JSON from {@code url}, converts it to XML, and prints the XML both before and
 * after it has been parsed by jsoup's XML parser.
 */
public static void normalXmlParse(){
    final String xml = CrawlerPack.jsonToXml(CrawlerPack.getFromRemote(url));

    // Result of converting the original JSON to XML.
    System.out.println("原始XML");
    System.out.println(xml);

    final Document parsedXml = Jsoup.parse(xml, "", Parser.xmlParser());
    parsedXml.charset(StandardCharsets.UTF_8);

    // What happened? Print the XML again after the jsoup round trip.
    System.out.println("轉換後XML");
    System.out.println(parsedXml.toString());
}
 
Example #5
Source File: AppShellSettings.java    From flow with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the element for this entry: a {@code style} or {@code script} element when the
 * wrapping type is STYLESHEET or JAVASCRIPT, otherwise the content parsed with the XML parser.
 * Loads {@code content} on first use and resolves an AUTOMATIC wrapping type from the file
 * extension; both results are stored back into the fields.
 *
 * @param request request passed to {@code BootstrapUtils.getDependencyContents}
 * @return the created or parsed element
 */
private Element element(VaadinRequest request) {
    if (content == null) {
        content = BootstrapUtils.getDependencyContents(request, file);
    }

    // AUTOMATIC is resolved from the file extension; it stays AUTOMATIC for other extensions.
    if (type == Wrapping.AUTOMATIC && file != null) {
        final String lowerCaseFile = file.toLowerCase();
        if (lowerCaseFile.endsWith(".css")) {
            type = Wrapping.STYLESHEET;
        } else if (lowerCaseFile.endsWith(".js")) {
            type = Wrapping.JAVASCRIPT;
        }
    }

    if (type == Wrapping.STYLESHEET) {
        return createElement("style", content, "type", "text/css");
    } else if (type == Wrapping.JAVASCRIPT) {
        return createElement("script", content, "type", "text/javascript");
    }
    return Jsoup.parse(content, "", Parser.xmlParser());
}
 
Example #6
Source File: JerryExtractor.java    From web-data-extractor with Apache License 2.0 6 votes vote down vote up
/**
 * Parses {@code str} with the XML parser and renders it according to {@code outType}.
 *
 * @param str markup to parse
 * @return the document's HTML for {@code TYPE_HTML}, its text for every other type
 */
private String parse(String str) {
    final Document document = Jsoup.parse(str, "", Parser.xmlParser());
    switch (outType) {
        case TYPE_HTML:
            return document.html();
        case TYPE_TEXT:
        default:
            // TYPE_TEXT and any unrecognised type both yield plain text.
            return document.text();
    }
}
 
Example #7
Source File: SelectorTest.java    From jsoup-learning with MIT License 6 votes vote down vote up
/**
 * Parses a small HTML sample containing comments inside a textarea and a table, then
 * prints the elements matched by the selector {@code "body div"}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String html = "<body>\n" +
            " <textarea>\n" +
            "        &lt;!-- Text --&gt;\n" +
            "        xxx\n" +
            "    </textarea> \n" +
            " <div> \n" +
            "  <table> \n" +
            "   <!-- InTable --> \n" +
            "   <!-- InTableText --> xxx \n" +
            "   <tbody> \n" +
            "    <tr> \n" +
            "     <!-- InRow --> \n" +
            "     <td> \n" +
            "      <!-- InCell --> </td> \n" +
            "    </tr> \n" +
            "   </tbody> \n" +
            "  </table> \n" +
            " </div> \n" +
            "</body>";

    final Document document = Parser.htmlParser().parseInput(html, "");
    System.out.println(document.select("body div"));
}
 
Example #8
Source File: NicoAudioSourceManager.java    From lavaplayer with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches the thumb-info XML for a video and turns it into a track.
 *
 * @param videoId identifier appended to the getthumbinfo URL
 * @return the track produced by {@code extractTrackFromXml}
 * @throws FriendlyException if the request fails or returns an unexpected status code
 */
private AudioTrack loadTrack(String videoId) {
  checkLoggedIn();

  final String infoUrl = "http://ext.nicovideo.jp/api/getthumbinfo/" + videoId;

  // Both resources are closed in reverse order when the block exits.
  try (
      HttpInterface httpInterface = getHttpInterface();
      CloseableHttpResponse response = httpInterface.execute(new HttpGet(infoUrl))
  ) {
    final int statusCode = response.getStatusLine().getStatusCode();
    if (!HttpClientTools.isSuccessWithContent(statusCode)) {
      throw new IOException("Unexpected response code from video info: " + statusCode);
    }

    final Document document = Jsoup.parse(
        response.getEntity().getContent(), StandardCharsets.UTF_8.name(), "", Parser.xmlParser());
    return extractTrackFromXml(videoId, document);
  } catch (IOException e) {
    throw new FriendlyException("Error occurred when extracting video info.", SUSPICIOUS, e);
  }
}
 
Example #9
Source File: BoxDotComAccount.java    From neembuu-uploader with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Requests an auth token from the Box REST API for the current ticket and stores it
 * as an encrypted property under {@code KEY_AUTH_TOKEN}.
 *
 * @throws Exception if the HTTP request or the handling of its response fails
 */
private void getUserInfo() throws Exception {
    NULogger.getLogger().log(Level.INFO, "{0} Getting auth token value............", getClass());

    final String tokenUrl = "https://www.box.net/api/1.0/rest?action=get_auth_token&api_key=vkf3k5dh0tg1ibvcikjcp8sx0f89d14u&ticket=" + ticket;
    httpGet = new NUHttpGet(tokenUrl);
    httpResponse = httpclient.execute(httpGet, httpContext);
    responseString = EntityUtils.toString(httpResponse.getEntity());

    // The response is parsed as XML; the token is the text of auth_token nested in response.
    doc = Jsoup.parse(responseString, "", Parser.xmlParser());
    final String authToken = doc.select("response auth_token").text();

    // NOTE(review): the token is written to the log in clear text — confirm this is intended.
    NULogger.getLogger().log(Level.INFO, "{0} Auth_token : {1}", new Object[]{getClass(), authToken});
    properties().setEncryptedProperty(KEY_AUTH_TOKEN, authToken);
}
 
Example #10
Source File: Node.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses {@code html} as a fragment and inserts the resulting nodes into this node's
 * parent at {@code index}.
 *
 * @param index position in the parent's children at which the nodes are added
 * @param html  markup to parse; must not be null
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    // The fragment is parsed in the context of the parent only when it is an Element.
    final Node parent = parent();
    final Element context;
    if (parent instanceof Element) {
        context = (Element) parent;
    } else {
        context = null;
    }

    final List<Node> parsed = Parser.parseFragment(html, context, baseUri());
    final Node[] siblings = parsed.toArray(new Node[0]);
    parentNode.addChildren(index, siblings);
}
 
Example #11
Source File: DataUtil.java    From jsoup-learning with MIT License 5 votes vote down vote up
/**
 * Reads a file fully into memory and parses it into a Document with the HTML parser.
 *
 * @param in file to load
 * @param charsetName character set of input
 * @param baseUri base URI of document, to resolve relative links against
 * @return the parsed Document
 * @throws IOException on IO error while opening, reading or closing the file
 */
public static Document load(File in, String charsetName, String baseUri) throws IOException {
    // Opened outside the try: if opening fails there is nothing to close.
    final FileInputStream inStream = new FileInputStream(in);
    try {
        return parseByteData(readToByteBuffer(inStream), charsetName, baseUri, Parser.htmlParser());
    } finally {
        inStream.close();
    }
}
 
Example #12
Source File: Element.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses the supplied HTML in the context of this element and inserts the resulting nodes
 * at the start of this element's children.
 *
 * @param html HTML to add inside this element, before the existing HTML; must not be null
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    final List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    final Node[] leading = parsed.toArray(new Node[0]);
    addChildren(0, leading);
    return this;
}
 
Example #13
Source File: Element.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses the supplied HTML in the context of this element and adds the resulting nodes
 * after this element's existing children.
 *
 * @param html HTML to add inside this element, after the existing HTML; must not be null
 * @return this element
 * @see #html(String)
 */
public Element append(String html) {
    Validate.notNull(html);

    final List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    final Node[] trailing = parsed.toArray(new Node[0]);
    addChildren(trailing);
    return this;
}
 
Example #14
Source File: RssLoader.java    From android-opensource-library-56 with Apache License 2.0 5 votes vote down vote up
/**
 * Downloads the feed URL, parses it with the XML parser and hands the document to
 * {@code parseCssSelector}. Any exception is printed and otherwise ignored.
 *
 * @return {@code mList}, whether or not loading succeeded
 */
@Override
public RssList loadInBackground() {
    try {
        parseCssSelector(
                Jsoup.connect(mFeed.url)
                        .parser(Parser.xmlParser())
                        .get());
        // parseDomTraverse(document);
    } catch (Exception e) {
        // Best effort: the failure is only reported on stderr.
        e.printStackTrace();
    }
    return mList;
}
 
Example #15
Source File: UrlConnectTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Fetches an XML URL with the HTML parser set explicitly and checks that the request
 * keeps an HTML tree builder and that the output is the HTML-parsed form of the XML.
 */
@Test
public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException {
    // should auto-detect xml and use XML parser, unless explicitly requested the html parser
    final Connection con = Jsoup
        .connect("http://direct.infohound.net/tools/parse-xml.xml")
        .parser(Parser.htmlParser());
    final Document doc = con.get();

    assertTrue(con.request().parser().getTreeBuilder() instanceof HtmlTreeBuilder);

    final String actual = StringUtil.normaliseWhitespace(doc.outerHtml());
    assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", actual);
}
 
Example #16
Source File: CharsetIdentification.java    From storm-crawler with Apache License 2.0 5 votes vote down vote up
/**
 * Looks for a META tag in the HTML that hints at the character set used to write the
 * document.
 *
 * @param buffer    raw bytes of the document
 * @param maxlength number of leading bytes to inspect; values that are not positive or
 *                  not smaller than the buffer length mean "the whole buffer"
 * @return the charset named by the first matching meta element that yields one, or
 *         {@code null} if none does or if parsing throws
 */
private static String getCharsetFromMeta(byte[] buffer, int maxlength) {
    final int len = (maxlength > 0 && maxlength < buffer.length) ? maxlength : buffer.length;

    // Decode with the default charset -- hopefully without mangling the
    // characters we are interested in.
    final String html = new String(buffer, 0, len, DEFAULT_CHARSET);

    try {
        final Document doc = Parser.htmlParser().parseInput(html, "dummy");

        // Matches <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
        // as well as the HTML5 form <meta charset="gb2312">.
        for (Element meta : doc.select("meta[http-equiv=content-type], meta[charset]")) {
            String candidate = null;
            if (meta.hasAttr("http-equiv")) {
                candidate = getCharsetFromContentType(meta.attr("content"));
            }
            if (candidate == null && meta.hasAttr("charset")) {
                candidate = meta.attr("charset");
            }
            if (candidate != null) {
                return candidate;
            }
        }
    } catch (Exception e) {
        // Any failure while parsing or selecting is treated as "no charset found".
        return null;
    }

    return null;
}
 
Example #17
Source File: UrlConnectTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Fetches an XML URL with the HTML parser set explicitly and checks that the request
 * keeps an HTML tree builder and that the output is the HTML-parsed form of the XML.
 */
@Test
public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException {
    // should auto-detect xml and use XML parser, unless explicitly requested the html parser
    final Connection con = Jsoup
        .connect("http://direct.infohound.net/tools/parse-xml.xml")
        .parser(Parser.htmlParser());
    final Document doc = con.get();

    assertTrue(con.request().parser().getTreeBuilder() instanceof HtmlTreeBuilder);

    final String actual = StringUtil.normaliseWhitespace(doc.outerHtml());
    assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", actual);
}
 
Example #18
Source File: DataUtilTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses UTF-8 bytes that start with a byte order mark, with no charset supplied, and
 * checks the head text and the charset reported by the output settings.
 */
@Test public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() {
    final String htmlWithBom = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
    final ByteBuffer encoded = Charset.forName("UTF-8").encode(htmlWithBom);

    final Document doc = DataUtil.parseByteData(encoded, null, "http://foo.com/", Parser.htmlParser());

    assertEquals("One", doc.head().text());
    assertEquals("UTF-8", doc.outputSettings().charset().displayName());
}
 
Example #19
Source File: Cleaner.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks a body HTML fragment by parsing it with error tracking and copying its safe
 * nodes into a clean shell document.
 *
 * @param bodyHtml HTML fragment to check
 * @return {@code true} only if no nodes were discarded by the copy and the parse
 *         recorded no errors
 */
public boolean isValidBodyHtml(String bodyHtml) {
    final ParseErrorList errors = ParseErrorList.tracking(1);
    final Document cleanShell = Document.createShell("");
    final Document dirtyShell = Document.createShell("");

    // Parse in the context of the dirty body, then attach the nodes to it.
    final List<Node> parsed = Parser.parseFragment(bodyHtml, dirtyShell.body(), "", errors);
    dirtyShell.body().insertChildren(0, parsed);

    if (copySafeNodes(dirtyShell.body(), cleanShell.body()) != 0) {
        return false;
    }
    return errors.size() == 0;
}
 
Example #20
Source File: HttpConnection.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Creates a request with the defaults: GET method, HTML parser, redirects followed,
 * a 30 second timeout, a 1MB body limit, no data, and the Accept-Encoding and
 * user-agent headers preset.
 */
Request() {
    // Request basics.
    method = Method.GET;
    followRedirects = true;
    data = new ArrayList<>();
    parser = Parser.htmlParser();

    // Transfer limits.
    timeoutMilliseconds = 30 * 1000; // 30 seconds
    maxBodySizeBytes = 1024 * 1024; // 1MB

    // Default headers, added in the same order as before.
    addHeader("Accept-Encoding", "gzip");
    addHeader(USER_AGENT, DEFAULT_UA);
}
 
Example #21
Source File: TextExtractorTest.java    From storm-crawler with Apache License 2.0 5 votes vote down vote up
/**
 * Configures the extractor to exclude "style" and checks that the text of an upper-case
 * {@code <STYLE>} element is left out of the extracted text.
 */
@Test
public void testExclusionCase() throws IOException {
    final Config conf = new Config();
    conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "style");
    final TextExtractor extractor = new TextExtractor(conf);

    final Document jsoupDoc = Parser.htmlParser().parseInput(
            "<html>the<STYLE>main</STYLE>content of the page</html>",
            "http://stormcrawler.net");

    assertEquals("the content of the page", extractor.text(jsoupDoc.body()));
}
 
Example #22
Source File: Element.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses the supplied HTML in the context of this element and inserts the resulting nodes
 * at the start of this element's children.
 *
 * @param html HTML to add inside this element, before the existing HTML; must not be null
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    final List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    final Node[] leading = parsed.toArray(new Node[0]);
    addChildren(0, leading);
    return this;
}
 
Example #23
Source File: Element.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses the supplied HTML in the context of this element and inserts the resulting nodes
 * at the start of this element's children.
 *
 * @param html HTML to add inside this element, before the existing HTML; must not be null
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    final List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    final Node[] leading = parsed.toArray(new Node[0]);
    addChildren(0, leading);
    return this;
}
 
Example #24
Source File: Node.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses {@code html} as a fragment and inserts the resulting nodes into this node's
 * parent at {@code index}.
 *
 * @param index position in the parent's children at which the nodes are added
 * @param html  markup to parse; must not be null
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    // The fragment is parsed in the context of the parent only when it is an Element.
    final Node parent = parent();
    final Element context;
    if (parent instanceof Element) {
        context = (Element) parent;
    } else {
        context = null;
    }

    final List<Node> parsed = Parser.parseFragment(html, context, baseUri());
    final Node[] siblings = parsed.toArray(new Node[0]);
    parentNode.addChildren(index, siblings);
}
 
Example #25
Source File: DataUtilTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses a document whose {@code <meta charset>} names an unusable charset ("iso-8")
 * with no charset supplied by the caller, and checks the serialized result.
 */
@Test
public void wrongMetaCharsetFallback() throws IOException {
    final String html = "<html><head><meta charset=iso-8></head><body></body></html>";

    final Document parsed = DataUtil.parseInputStream(
        stream(html), null, "http://example.com", Parser.htmlParser());
    final String actual = parsed.toString();

    final String expected = "<html>\n" +
        " <head>\n" +
        "  <meta charset=\"iso-8\">\n" +
        " </head>\n" +
        " <body></body>\n" +
        "</html>";

    assertEquals(expected, actual);
}
 
Example #26
Source File: DataUtilTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Feeds euc-kr encoded bytes whose first Content-Type meta has no charset and whose
 * second one declares euc-kr, and checks that the body text is decoded correctly.
 */
@Test
public void secondMetaElementWithContentTypeContainsCharsetParameter() throws Exception {
    final String bodyText = "한국어";
    final String html = "<html><head>" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html\">" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=euc-kr\">" +
            "</head><body>" + bodyText + "</body></html>";

    final Document parsed = DataUtil.parseInputStream(
            stream(html, "euc-kr"), null, "http://example.com", Parser.htmlParser());

    assertEquals(bodyText, parsed.body().text());
}
 
Example #27
Source File: DataUtilTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Feeds iso-8859-1 encoded bytes with two Content-Type metas declaring different
 * charsets (iso-8859-1 first, koi8-u second) and checks that the body text is decoded
 * correctly.
 */
@Test
public void firstMetaElementWithCharsetShouldBeUsedForDecoding() throws Exception {
    final String bodyText = "Übergrößenträger";
    final String html = "<html><head>" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=koi8-u\">" +
            "</head><body>" + bodyText + "</body></html>";

    final Document parsed = DataUtil.parseInputStream(
            stream(html, "iso-8859-1"), null, "http://example.com", Parser.htmlParser());

    assertEquals(bodyText, parsed.body().text());
}
 
Example #28
Source File: Cleaner.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks a body HTML fragment by parsing it with error tracking and copying its safe
 * nodes into a clean shell document.
 *
 * @param bodyHtml HTML fragment to check
 * @return {@code true} only if no nodes were discarded by the copy and the parse
 *         recorded no errors
 */
public boolean isValidBodyHtml(String bodyHtml) {
    final ParseErrorList errors = ParseErrorList.tracking(1);
    final Document cleanShell = Document.createShell("");
    final Document dirtyShell = Document.createShell("");

    // Parse in the context of the dirty body, then attach the nodes to it.
    final List<Node> parsed = Parser.parseFragment(bodyHtml, dirtyShell.body(), "", errors);
    dirtyShell.body().insertChildren(0, parsed);

    if (copySafeNodes(dirtyShell.body(), cleanShell.body()) != 0) {
        return false;
    }
    return errors.size() == 0;
}
 
Example #29
Source File: HttpConnection.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Creates a request with the defaults: GET method, HTML parser, redirects followed,
 * a 30 second timeout, a 1MB body limit, no data, and the Accept-Encoding and
 * user-agent headers preset.
 */
Request() {
    // Request basics.
    method = Method.GET;
    followRedirects = true;
    data = new ArrayList<>();
    parser = Parser.htmlParser();

    // Transfer limits.
    timeoutMilliseconds = 30 * 1000; // 30 seconds
    maxBodySizeBytes = 1024 * 1024; // 1MB

    // Default headers, added in the same order as before.
    addHeader("Accept-Encoding", "gzip");
    addHeader(USER_AGENT, DEFAULT_UA);
}
 
Example #30
Source File: Node.java    From jsoup-learning with MIT License 5 votes vote down vote up
/**
 * Parses {@code html} as a fragment and inserts the resulting nodes into this node's
 * parent at {@code index}.
 *
 * @param index position in the parent's children at which the nodes are added
 * @param html  markup to parse; must not be null
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    // The fragment is parsed in the context of the parent only when it is an Element.
    final Node parent = parent();
    final Element context;
    if (parent instanceof Element) {
        context = (Element) parent;
    } else {
        context = null;
    }

    final List<Node> parsed = Parser.parseFragment(html, context, baseUri());
    final Node[] siblings = parsed.toArray(new Node[0]);
    parentNode.addChildren(index, siblings);
}