Java Code Examples for org.jsoup.parser.Parser

The following examples show how to use org.jsoup.parser.Parser. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: lavaplayer   Source File: NicoAudioSourceManager.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Requests the thumbnail-info document for the given video and turns it into a track.
 *
 * @param videoId identifier appended to the getthumbinfo endpoint URL
 * @return the track produced by {@code extractTrackFromXml} from the parsed response
 * @throws FriendlyException wrapping any {@link IOException} raised while fetching or parsing
 */
private AudioTrack loadTrack(String videoId) {
  checkLoggedIn();

  String infoUrl = "http://ext.nicovideo.jp/api/getthumbinfo/" + videoId;

  // Both resources are closed in reverse order of declaration: response first, then the interface.
  try (HttpInterface httpInterface = getHttpInterface();
       CloseableHttpResponse response = httpInterface.execute(new HttpGet(infoUrl))) {
    int statusCode = response.getStatusLine().getStatusCode();
    if (!HttpClientTools.isSuccessWithContent(statusCode)) {
      throw new IOException("Unexpected response code from video info: " + statusCode);
    }

    // The response body is parsed with the XML parser, not the HTML one.
    Document infoDocument = Jsoup.parse(
        response.getEntity().getContent(),
        StandardCharsets.UTF_8.name(),
        "",
        Parser.xmlParser());
    return extractTrackFromXml(videoId, infoDocument);
  } catch (IOException e) {
    throw new FriendlyException("Error occurred when extracting video info.", SUSPICIOUS, e);
  }
}
 
Example 2
/**
 * Re-serializes {@code code} through jsoup using the formatter's output settings.
 *
 * @param code   source text to format
 * @param ending not used by this implementation
 * @return the formatted text, or {@code null} when formatting leaves {@code code} unchanged
 * @throws IllegalArgumentException if the formatter's syntax is neither html nor xml
 */
@Override
public String doFormat(String code, LineEnding ending) {
    // Pick the parser that matches the configured syntax; anything else is rejected.
    final Parser parser;
    switch (formatter.syntax()) {
    case html:
        parser = Parser.htmlParser();
        break;
    case xml:
        parser = Parser.xmlParser();
        break;
    default:
        throw new IllegalArgumentException(formatter.syntax() + " is not allowed as syntax");
    }

    Document parsed = Jsoup.parse(code, "", parser);
    parsed.outputSettings(formatter);

    String formattedCode = parsed.outerHtml();
    // null signals "nothing changed" to the caller.
    return code.equals(formattedCode) ? null : formattedCode;
}
 
Example 3
Source Project: CrawlerPack   Source File: Ch5Coz4.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Demo: fetches remote JSON, converts it to XML, prints that XML, then prints the
 * result of running the same XML through jsoup's XML parser.
 */
public static void normalXmlParse(){
    // Download the JSON and convert it to XML in one step
    String xml = CrawlerPack.jsonToXml(CrawlerPack.getFromRemote(url));

    // Result of converting the original JSON to XML
    System.out.println( "原始XML" ) ;
    System.out.println( xml );

    Document parsedXml = Jsoup.parse(xml, "", Parser.xmlParser());
    parsedXml.charset(StandardCharsets.UTF_8);

    // What happened? Print the document after the jsoup round trip for comparison
    System.out.println( "轉換後XML" ) ;
    System.out.println(parsedXml.toString());
}
 
Example 4
Source Project: flow   Source File: AppShellSettings.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the element that carries this entry's content.
 *
 * Content is loaded from {@code file} on first use. With AUTOMATIC wrapping the
 * wrapping type is derived from the file extension (.css or .js).
 *
 * @param request request handed to {@code BootstrapUtils.getDependencyContents}
 * @return a style element, a script element, or the content parsed with the XML parser
 */
private Element element(VaadinRequest request) {
    if (content == null) {
        content = BootstrapUtils.getDependencyContents(request, file);
    }

    // Resolve AUTOMATIC wrapping from the file name, if there is one.
    if (type == Wrapping.AUTOMATIC && file != null) {
        String lowerCaseName = file.toLowerCase();
        if (lowerCaseName.endsWith(".css")) {
            type = Wrapping.STYLESHEET;
        } else if (lowerCaseName.endsWith(".js")) {
            type = Wrapping.JAVASCRIPT;
        }
    }

    if (type == Wrapping.STYLESHEET) {
        return createElement("style", content, "type", "text/css");
    } else if (type == Wrapping.JAVASCRIPT) {
        return createElement("script", content, "type", "text/javascript");
    } else {
        // Any other wrapping: treat the content as markup and parse it as-is.
        return Jsoup.parse(content, "", Parser.xmlParser());
    }
}
 
Example 5
Source Project: web-data-extractor   Source File: JerryExtractor.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Parses {@code str} with the XML parser and renders it according to {@code outType}.
 *
 * @param str markup to parse
 * @return the document's inner HTML for TYPE_HTML, otherwise its text
 */
private String parse(String str) {
    Document parsed = Jsoup.parse(str, "", Parser.xmlParser());
    switch (outType) {
        case TYPE_HTML:
            return parsed.html();
        case TYPE_TEXT:
        default:
            // TYPE_TEXT and every unrecognised type both yield plain text.
            return parsed.text();
    }
}
 
Example 6
/**
 * Exchanges the current ticket for an auth token and stores the token as an
 * encrypted property under {@code KEY_AUTH_TOKEN}.
 *
 * The response body is parsed with the XML parser and the token is read from
 * the {@code response auth_token} element; an absent element yields an empty string.
 *
 * @throws Exception if executing the request or reading the response fails
 */
private void getUserInfo() throws Exception {
    NULogger.getLogger().log(Level.INFO, "{0} Getting auth token value............", getClass());
    httpGet = new NUHttpGet("https://www.box.net/api/1.0/rest?action=get_auth_token&api_key=vkf3k5dh0tg1ibvcikjcp8sx0f89d14u&ticket=" + ticket);
    httpResponse = httpclient.execute(httpGet, httpContext);
    responseString = EntityUtils.toString(httpResponse.getEntity());

    doc = Jsoup.parse(responseString, "", Parser.xmlParser());
    String authToken = doc.select("response auth_token").text();

    // The token is a credential: log only whether one was received, never its value.
    NULogger.getLogger().log(Level.INFO, "{0} Auth_token received : {1}", new Object[]{getClass(), !authToken.isEmpty()});
    properties().setEncryptedProperty(KEY_AUTH_TOKEN, authToken);
}
 
Example 7
Source Project: jsoup-learning   Source File: SelectorTest.java    License: MIT License 6 votes vote down vote up
/**
 * Demo: parses a small HTML page with the HTML parser and prints every element
 * matched by the selector {@code body div}.
 */
public static void main(String[] args) {
    String html = "<body>\n"
            + " <textarea>\n"
            + "        &lt;!-- Text --&gt;\n"
            + "        xxx\n"
            + "    </textarea> \n"
            + " <div> \n"
            + "  <table> \n"
            + "   <!-- InTable --> \n"
            + "   <!-- InTableText --> xxx \n"
            + "   <tbody> \n"
            + "    <tr> \n"
            + "     <!-- InRow --> \n"
            + "     <td> \n"
            + "      <!-- InCell --> </td> \n"
            + "    </tr> \n"
            + "   </tbody> \n"
            + "  </table> \n"
            + " </div> \n"
            + "</body>";

    // Empty base URI: the sample contains no relative links to resolve.
    Document parsedPage = Parser.htmlParser().parseInput(html, "");
    Elements divsInBody = parsedPage.select("body div");
    System.out.println(divsInBody);
}
 
Example 8
Source Project: astor   Source File: DataUtilTest.java    License: GNU General Public License v2.0 6 votes vote down vote up
/**
 * A meta tag naming the unknown charset "iso-8" must not break parsing: the
 * document still parses and serializes to the expected markup.
 */
@Test
public void wrongMetaCharsetFallback() {
    final String expectedHtml = "<html>\n"
            + " <head>\n"
            + "  <meta charset=\"iso-8\">\n"
            + " </head>\n"
            + " <body></body>\n"
            + "</html>";

    try {
        final byte[] rawBytes = "<html><head><meta charset=iso-8></head><body></body></html>".getBytes("UTF-8");

        // No charset is supplied (null), so the parser has to work it out itself.
        Document parsed = DataUtil.parseByteData(
                ByteBuffer.wrap(rawBytes), null, "http://example.com", Parser.htmlParser());

        assertEquals(expectedHtml, parsed.toString());
    } catch (UnsupportedEncodingException ex) {
        fail(ex.getMessage());
    }
}
 
Example 9
Source Project: astor   Source File: UrlConnectTest.java    License: GNU General Public License v2.0 6 votes vote down vote up
/**
 * A 200 response with an empty body must be parseable with both the default
 * parser and an explicitly configured XML parser.
 */
@Test
public void handles200WithNoContent() throws IOException {
    // First pass: default parser.
    Connection.Response res = Jsoup
        .connect("http://direct.infohound.net/tools/200-no-content.pl")
        .userAgent(browserUa)
        .execute();
    res.parse();
    assertEquals(200, res.statusCode());

    // Second pass: same URL, XML parser requested explicitly.
    res = Jsoup
        .connect("http://direct.infohound.net/tools/200-no-content.pl")
        .parser(Parser.xmlParser())
        .userAgent(browserUa)
        .execute();
    res.parse();
    assertEquals(200, res.statusCode());
}
 
Example 10
Source Project: astor   Source File: UrlConnectTest.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Fetching an XML resource with the HTML parser explicitly set must keep the
 * HTML tree builder and produce HTML-shaped output.
 */
@Test
public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException {
    // XML would normally be auto-detected and parsed as XML; an explicit HTML parser overrides that.
    Connection con = Jsoup
            .connect("http://direct.infohound.net/tools/parse-xml.xml")
            .parser(Parser.htmlParser());
    Document doc = con.get();

    assertTrue(con.request().parser().getTreeBuilder() instanceof HtmlTreeBuilder);

    String normalisedHtml = StringUtil.normaliseWhitespace(doc.outerHtml());
    assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", normalisedHtml);
}
 
Example 11
Source Project: jsoup-learning   Source File: ParserCorrectorTest.java    License: MIT License 5 votes vote down vote up
/**
 * Demo: parses HTML containing an unclosed {@code div} with error tracking
 * enabled (up to 100 errors) and prints the recorded parse errors.
 */
public static void main(String[] args) {
    String htmlWithDivUnclosed = "<body>\n"
            + " <textarea>\n"
            + "        &lt;!-- Text --&gt;\n"
            + "        xxx\n"
            + "    </textarea> \n"
            + " <div> \n"
            + " <div>\n"
            + "  <table> \n"
            + "   <!-- InTable --> \n"
            + "   <!-- InTableText --> xxx \n"
            + "   <tbody> \n"
            + "    <tr> \n"
            + "     <!-- InRow --> \n"
            + "     <td> \n"
            + "      <!-- InCell --> </td> \n"
            + "    </tr> \n"
            + "   </tbody> \n"
            + "  </table> \n"
            + " </div> \n"
            + "</body>";

    Parser errorTrackingParser = Parser.htmlParser();
    errorTrackingParser.setTrackErrors(100);

    // Only the errors are of interest; the resulting document is discarded.
    errorTrackingParser.parseInput(htmlWithDivUnclosed, "");
    List<ParseError> parseErrors = errorTrackingParser.getErrors();
    System.out.println(parseErrors);
}
 
Example 12
Source Project: ankihelper   Source File: YoudaoOnline.java    License: GNU General Public License v3.0 5 votes vote down vote up
/**
 * Looks up {@code key} online and collects the phonetic symbol, return phrase,
 * translations and web translations from the XML-parsed response.
 *
 * @param key term to look up; it is trimmed before being placed in the URL
 * @return a populated {@code YoudaoResult}
 * @throws IOException if the HTTP call or reading the body fails
 */
static public YoudaoResult getDefinition(String key) throws IOException{
    // Fetch the raw response through the shared OkHttp client.
    Request lookupRequest = new Request.Builder().url(String.format(BASE_URL, key.trim())).build();
    String rawResponse = MyApplication.getOkHttpClient().newCall(lookupRequest).execute().body().string();
    Document doc = Jsoup.parse(rawResponse, "", Parser.xmlParser());

    String phonetic = getSingleQueryResult(doc, "phonetic-symbol");
    String returnPhrase = getSingleQueryResult(doc, "return-phrase");

    List<String> translation = new ArrayList<String>();
    for (Element content : doc.select("translation > content")) {
        translation.add(content.text());
    }

    // LinkedHashMap keeps the web translations in document order.
    Map<String, List<String>> webTranslation = new LinkedHashMap<>();
    for (Element web : doc.select("web-translation")) {
        String phrase = getSingleQueryResult(web, "key");
        List<String> meanings = new ArrayList<>();
        for (Element value : web.select("trans > value")) {
            meanings.add(value.text().trim());
        }
        webTranslation.put(phrase, meanings);
    }

    YoudaoResult result = new YoudaoResult();
    result.phonetic = phonetic;
    result.returnPhrase = returnPhrase;
    result.translation = translation;
    result.webTranslation = webTranslation;
    return result;
}
 
Example 13
/**
 * Finds the first available daily XML file and returns the short names of the
 * classes it lists.
 *
 * Days are probed from four days before today up to (but excluding)
 * {@code MAX_DAYS} days ahead. A day whose request fails with an
 * {@link HttpResponseException} is skipped and the next day is tried.
 *
 * @param filePrefix file name prefix placed before the yyyyMMdd date in the URL
 * @return the class short names from the first file that loads; an empty list if
 *         no day was probed at all
 * @throws HttpResponseException the last response error, if every probed day failed
 * @throws IOException on any other I/O failure from the HTTP request
 */
@NotNull private List<String> parseClasses(String filePrefix)
        throws JSONException, IOException, CredentialInvalidException {
    String baseurl = data.getString(PARAM_BASEURL) + "/";
    // Read "today" once so all offsets refer to the same day even if the loop runs across midnight.
    LocalDate today = LocalDate.now();
    HttpResponseException lastException = null;
    for (int i = -4; i < MAX_DAYS; i++) {
        LocalDate date = today.plusDays(i);
        String dateStr = DateTimeFormat.forPattern("yyyyMMdd").print(date);
        // The timestamp query parameter makes every URL unique.
        String url = baseurl + "mobdaten/" + filePrefix + dateStr + ".xml?_=" + System.currentTimeMillis();
        try {
            String xml = httpGet(url, "UTF-8");
            Document doc = Jsoup.parse(xml, url, Parser.xmlParser());

            List<String> classes = new ArrayList<>();
            for (Element klasse : doc.select("Klassen > Kl")) {
                // first() returns null when the entry has no Kurz child; skip such entries
                // instead of failing the whole lookup with a NullPointerException.
                Element shortName = klasse.select("Kurz").first();
                if (shortName != null) {
                    classes.add(shortName.text());
                }
            }
            return classes;
        } catch (HttpResponseException e) {
            // Remember the failure and try the next day.
            lastException = e;
        }
    }
    if (lastException != null) {
        throw lastException;
    } else {
        return new ArrayList<>();
    }
}
 
Example 14
Source Project: jsoup-learning   Source File: PageErrorChecker.java    License: MIT License 5 votes vote down vote up
/**
 * Downloads the page at {@code url} and reports the HTML parse errors found in it.
 *
 * @param url page to fetch; also used as the base URI when parsing
 * @return the parse errors recorded by the parser (tracking is capped at 100)
 * @throws IOException if fetching the page fails
 */
public static List<ParseError> check(String url) throws IOException {
    String pageBody = Jsoup.connect(url)
            .userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36")
            .execute()
            .body();

    Parser errorTrackingParser = Parser.htmlParser();
    errorTrackingParser.setTrackErrors(100);
    errorTrackingParser.parseInput(pageBody, url);
    return errorTrackingParser.getErrors();
}
 
Example 15
/**
 * Checks that parsing the XML fixture and parsing the HTML fixture yield equal
 * schedule days.
 */
@Test
public void testEquals() throws IOException, JSONException {
    // XML fixture: parsed with jsoup's XML parser, second argument false.
    // NOTE(review): the boolean presumably marks the input as HTML — confirm against parseIndiwareDay.
    SubstitutionScheduleDay scheduleXML = parser.parseIndiwareDay(Jsoup.parse(xml, "", Parser.xmlParser()),
            false);
    // HTML fixture: parsed with jsoup's default parser, second argument true.
    SubstitutionScheduleDay scheduleHTML = parser.parseIndiwareDay(Jsoup.parse(html), true);
    assertEquals(scheduleXML, scheduleHTML);
}
 
Example 16
Source Project: CrawlerPack   Source File: CrawlerPack.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts HTML into a Jsoup Document object.
 *
 * HTML content is parsed with Jsoup's native HTML parser, and the resulting
 * document's charset is set to UTF-8.
 *
 * @param html Html document
 * @return org.jsoup.nodes.Document
 */
public org.jsoup.nodes.Document htmlToJsoupDoc(String html){

    // Convert html (html/html5) into a jsoup Document object.
    // The second argument of Jsoup.parse(String, String, Parser) is the base URI used to
    // resolve relative links, not a charset name. No base URI is known here, so pass "".
    Document jsoupDoc = Jsoup.parse(html, "", Parser.htmlParser() );
    jsoupDoc.charset(StandardCharsets.UTF_8);

    return jsoupDoc;
}
 
Example 17
Source Project: CrawlerPack   Source File: CrawlerPack.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts XML into a Jsoup Document object.
 *
 * Jsoup 1.9.1+ supports non-ASCII tag names.
 * -----
 * Previously, a tag whose first character was not a-zA-Z was parsed by jsoup
 * as a comment, so a prefix had to be injected before parsing and stripped
 * again by a customised XML parser on output. That workaround is no longer used.
 *
 * @param xml XML format string
 * @return org.jsoup.nodes.Document
 */
public org.jsoup.nodes.Document xmlToJsoupDoc(String xml){

    // Parse with the stock XML parser; no base URI is supplied.
    Document parsedXml = Jsoup.parse(xml, "", Parser.xmlParser() );

    // Mark the document as UTF-8.
    parsedXml.charset(StandardCharsets.UTF_8);
    return parsedXml;
}
 
Example 18
Source Project: astor   Source File: Node.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses {@code html} as a fragment and inserts the resulting nodes into this
 * node's parent at {@code index}.
 *
 * @param index position in the parent's child list at which to insert
 * @param html  markup to parse; must not be null
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    // Use the parent as parsing context only when it is an Element.
    Element context = null;
    if (parent() instanceof Element) {
        context = (Element) parent();
    }

    List<Node> parsed = Parser.parseFragment(html, context, baseUri());
    Node[] siblings = parsed.toArray(new Node[parsed.size()]);
    parentNode.addChildren(index, siblings);
}
 
Example 19
Source Project: jsoup-learning   Source File: HttpConnection.java    License: MIT License 5 votes vote down vote up
/**
 * Creates a request with the default settings: GET, HTML parser, redirects
 * followed, 3000 ms timeout, 1MB body limit and gzip accepted.
 */
private Request() {
    // What to send
    method = Connection.Method.GET;
    data = new ArrayList<Connection.KeyVal>();
    headers.put("Accept-Encoding", "gzip");

    // Transfer limits
    timeoutMilliseconds = 3000;
    maxBodySizeBytes = 1024 * 1024; // 1MB
    followRedirects = true;

    // How to read the response
    parser = Parser.htmlParser();
}
 
Example 20
Source Project: astor   Source File: HttpConnection.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Creates a request with the default settings: GET, HTML parser, redirects
 * followed, 3000 ms timeout, 1MB body limit and gzip accepted.
 */
private Request() {
    // What to send
    method = Method.GET;
    data = new ArrayList<Connection.KeyVal>();
    headers.put("Accept-Encoding", "gzip");

    // Transfer limits
    timeoutMilliseconds = 3000;
    maxBodySizeBytes = 1024 * 1024; // 1MB
    followRedirects = true;

    // How to read the response
    parser = Parser.htmlParser();
}
 
Example 21
Source Project: storm-crawler   Source File: CharsetIdentification.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Looks for a META tag in the HTML that declares the document's character set.
 *
 * @param buffer    raw document bytes
 * @param maxlength number of leading bytes to inspect; values that are not positive
 *                  or not smaller than the buffer length mean "the whole buffer"
 * @return the declared charset, or {@code null} if none is found or parsing fails
 */
private static String getCharsetFromMeta(byte buffer[], int maxlength) {
    // Decode with the default charset -- hopefully this does not mangle the
    // characters we are interested in.
    int len = (maxlength > 0 && maxlength < buffer.length) ? maxlength : buffer.length;
    String html = new String(buffer, 0, len, DEFAULT_CHARSET);

    try {
        Document doc = Parser.htmlParser().parseInput(html, "dummy");

        // Candidates: <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
        // or the HTML5 form <meta charset="gb2312">
        for (Element meta : doc.select("meta[http-equiv=content-type], meta[charset]")) {
            String candidate = null;
            if (meta.hasAttr("http-equiv")) {
                candidate = getCharsetFromContentType(meta.attr("content"));
            }
            // Fall back to the charset attribute when the content type gave nothing.
            if (candidate == null && meta.hasAttr("charset")) {
                candidate = meta.attr("charset");
            }
            if (candidate != null) {
                return candidate;
            }
        }
    } catch (Exception e) {
        // Best effort: any failure while parsing means no charset was found.
    }

    return null;
}
 
Example 22
Source Project: android-opensource-library-56   Source File: RssLoader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Fetches the feed with the XML parser and hands the document to
 * {@code parseCssSelector}. Failures are printed and otherwise ignored.
 *
 * @return {@code mList}, whether or not loading succeeded
 */
@Override
public RssList loadInBackground() {
    try {
        // Feeds are XML, so force the XML parser rather than the HTML default.
        parseCssSelector(
                Jsoup.connect(mFeed.url)
                        .parser(Parser.xmlParser())
                        .get());
        // Alternative traversal: parseDomTraverse(document)
    } catch (Exception e) {
        e.printStackTrace();
    }
    return mList;
}
 
Example 23
Source Project: astor   Source File: Element.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses the supplied HTML and inserts the resulting nodes in front of this
 * element's existing children.
 *
 * @param html HTML to add inside this element, before the existing HTML
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    // Parse in the context of this element so the fragment is interpreted as its content.
    List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    Node[] newChildren = parsed.toArray(new Node[parsed.size()]);
    addChildren(0, newChildren);
    return this;
}
 
Example 24
Source Project: jsoup-learning   Source File: DataUtil.java    License: MIT License 5 votes vote down vote up
/**
 * Reads a file fully into memory and parses it into a Document with the HTML parser.
 *
 * @param in file to load
 * @param charsetName character set of input
 * @param baseUri base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 */
public static Document load(File in, String charsetName, String baseUri) throws IOException {
    // If opening fails there is nothing to close, so the stream is created before the try.
    FileInputStream fileStream = new FileInputStream(in);
    try {
        ByteBuffer fileBytes = readToByteBuffer(fileStream);
        return parseByteData(fileBytes, charsetName, baseUri, Parser.htmlParser());
    } finally {
        fileStream.close();
    }
}
 
Example 25
Source Project: astor   Source File: Element.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses the supplied HTML and adds the resulting nodes after this element's
 * existing children.
 *
 * @param html HTML to add inside this element, after the existing HTML
 * @return this element
 * @see #html(String)
 */
public Element append(String html) {
    Validate.notNull(html);

    // Parse in the context of this element so the fragment is interpreted as its content.
    List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    Node[] newChildren = parsed.toArray(new Node[parsed.size()]);
    addChildren(newChildren);
    return this;
}
 
Example 26
Source Project: astor   Source File: UrlConnectTest.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Fetching an XML resource with the HTML parser explicitly set must keep the
 * HTML tree builder and produce HTML-shaped output.
 */
@Test
public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException {
    // XML would normally be auto-detected and parsed as XML; an explicit HTML parser overrides that.
    Connection con = Jsoup
            .connect("http://direct.infohound.net/tools/parse-xml.xml")
            .parser(Parser.htmlParser());
    Document doc = con.get();

    assertTrue(con.request().parser().getTreeBuilder() instanceof HtmlTreeBuilder);

    String normalisedHtml = StringUtil.normaliseWhitespace(doc.outerHtml());
    assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", normalisedHtml);
}
 
Example 27
Source Project: astor   Source File: DataUtilTest.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Input that starts with a byte order mark and has no charset supplied must
 * still parse correctly and end up with UTF-8 as its output charset.
 */
@Test public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() {
    // The leading \uFEFF is the byte order mark under test.
    ByteBuffer encoded = Charset.forName("UTF-8")
            .encode("\uFEFF<html><head><title>One</title></head><body>Two</body></html>");

    // The charset argument is null.
    Document parsed = DataUtil.parseByteData(encoded, null, "http://foo.com/", Parser.htmlParser());

    assertEquals("One", parsed.head().text());
    assertEquals("UTF-8", parsed.outputSettings().charset().displayName());
}
 
Example 28
Source Project: astor   Source File: Cleaner.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Tests whether a body HTML fragment parses without errors and survives
 * cleaning with nothing discarded.
 *
 * @param bodyHtml HTML fragment to check
 * @return true only if no nodes were discarded and no parse error was recorded
 */
public boolean isValidBodyHtml(String bodyHtml) {
    Document clean = Document.createShell("");
    Document dirty = Document.createShell("");

    // Tracking a single error is enough: any error makes the input invalid.
    ParseErrorList parseErrors = ParseErrorList.tracking(1);
    List<Node> fragment = Parser.parseFragment(bodyHtml, dirty.body(), "", parseErrors);
    dirty.body().insertChildren(0, fragment);

    int discardedCount = copySafeNodes(dirty.body(), clean.body());
    if (discardedCount != 0) {
        return false;
    }
    return parseErrors.size() == 0;
}
 
Example 29
Source Project: astor   Source File: HttpConnection.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Creates a request populated with the default settings assigned below.
 */
Request() {
    timeoutMilliseconds = 30000; // 30 seconds
    maxBodySizeBytes = 1024 * 1024; // 1MB
    followRedirects = true;
    // Starts with an empty data list.
    data = new ArrayList<>();
    method = Method.GET;
    // Default headers: accept gzip and send the default user agent.
    addHeader("Accept-Encoding", "gzip");
    addHeader(USER_AGENT, DEFAULT_UA);
    // The HTML parser is the default parser.
    parser = Parser.htmlParser();
}
 
Example 30
Source Project: storm-crawler   Source File: TextExtractorTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * With "style" configured as an excluded element, the content of an upper-case
 * {@code <STYLE>} tag must be left out of the extracted text.
 */
@Test
public void testExclusionCase() throws IOException {
    Config conf = new Config();
    conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "style");
    TextExtractor extractor = new TextExtractor(conf);

    // The tag is written in upper case while the exclusion is configured in lower case.
    Document parsed = Parser.htmlParser().parseInput(
            "<html>the<STYLE>main</STYLE>content of the page</html>",
            "http://stormcrawler.net");

    String extracted = extractor.text(parsed.body());
    assertEquals("the content of the page", extracted);
}