org.jsoup.parser.Parser Java Examples
The following examples show how to use
org.jsoup.parser.Parser.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source Project: lavaplayer Author: sedmelluq File: NicoAudioSourceManager.java License: Apache License 2.0 | 6 votes |
/**
 * Loads a track by requesting the thumb-info document for the given video id
 * and passing the XML-parsed response to {@code extractTrackFromXml}.
 *
 * @param videoId identifier appended to the thumb-info endpoint URL
 * @return the track produced by {@code extractTrackFromXml}
 * @throws FriendlyException (severity SUSPICIOUS) wrapping any {@link IOException},
 *         including the one raised for an unexpected HTTP status code
 */
private AudioTrack loadTrack(String videoId) {
    checkLoggedIn();

    String infoUrl = "http://ext.nicovideo.jp/api/getthumbinfo/" + videoId;

    // Both resources are closed in reverse order of acquisition when the block exits.
    try (HttpInterface httpInterface = getHttpInterface();
         CloseableHttpResponse response = httpInterface.execute(new HttpGet(infoUrl))) {
        int statusCode = response.getStatusLine().getStatusCode();
        if (!HttpClientTools.isSuccessWithContent(statusCode)) {
            throw new IOException("Unexpected response code from video info: " + statusCode);
        }

        // The endpoint answers with XML, so the XML parser is used instead of the HTML one.
        Document document = Jsoup.parse(
                response.getEntity().getContent(),
                StandardCharsets.UTF_8.name(),
                "",
                Parser.xmlParser());
        return extractTrackFromXml(videoId, document);
    } catch (IOException e) {
        throw new FriendlyException("Error occurred when extracting video info.", SUSPICIOUS, e);
    }
}
Example #2
Source Project: formatter-maven-plugin Author: revelc File: JsoupBasedFormatter.java License: Apache License 2.0 | 6 votes |
/**
 * Formats the given markup with jsoup, using the HTML or XML parser according to
 * {@code formatter.syntax()}.
 *
 * @param code   the source text to format
 * @param ending line ending requested by the caller (not used by this implementation)
 * @return the re-serialized markup, or {@code null} when formatting changed nothing
 * @throws IllegalArgumentException if the configured syntax is neither html nor xml
 */
@Override
public String doFormat(String code, LineEnding ending) {
    final Parser parser;
    switch (formatter.syntax()) {
    case html:
        parser = Parser.htmlParser();
        break;
    case xml:
        parser = Parser.xmlParser();
        break;
    default:
        throw new IllegalArgumentException(formatter.syntax() + " is not allowed as syntax");
    }

    Document document = Jsoup.parse(code, "", parser);
    document.outputSettings(formatter);
    String formattedCode = document.outerHtml();

    // null signals "already formatted" to the caller.
    return code.equals(formattedCode) ? null : formattedCode;
}
Example #3
Source Project: CrawlerPack Author: abola File: Ch5Coz4.java License: Apache License 2.0 | 6 votes |
public static void normalXmlParse(){ String json = CrawlerPack.getFromRemote(url); String xml = CrawlerPack.jsonToXml(json); // 原始 json 轉為 xml 的結果 System.out.println( "原始XML" ) ; System.out.println( xml ); Document jsoupDoc = Jsoup.parse(xml, "", Parser.xmlParser()); jsoupDoc.charset(StandardCharsets.UTF_8); // 發生了什麼事? System.out.println( "轉換後XML" ) ; System.out.println(jsoupDoc.toString()); }
Example #4
Source Project: flow Author: vaadin File: AppShellSettings.java License: Apache License 2.0 | 6 votes |
/**
 * Produces the element for this setting.
 * <p>
 * Content is loaded on first use through {@code BootstrapUtils.getDependencyContents}.
 * When the wrapping type is AUTOMATIC and a file name is present, the type is
 * derived from the {@code .css} / {@code .js} extension (case-insensitively).
 *
 * @param request the request used to load the dependency contents
 * @return a style element, a script element, or the content parsed with the XML parser
 */
private Element element(VaadinRequest request) {
    if (content == null) {
        content = BootstrapUtils.getDependencyContents(request, file);
    }

    if (type == Wrapping.AUTOMATIC && file != null) {
        String lowerCaseFile = file.toLowerCase();
        if (lowerCaseFile.endsWith(".css")) {
            type = Wrapping.STYLESHEET;
        } else if (lowerCaseFile.endsWith(".js")) {
            type = Wrapping.JAVASCRIPT;
        }
    }

    if (type == Wrapping.STYLESHEET) {
        return createElement("style", content, "type", "text/css");
    } else if (type == Wrapping.JAVASCRIPT) {
        return createElement("script", content, "type", "text/javascript");
    }

    // Any other wrapping type: treat the content itself as markup.
    return Jsoup.parse(content, "", Parser.xmlParser());
}
Example #5
Source Project: web-data-extractor Author: fivesmallq File: JerryExtractor.java License: Apache License 2.0 | 6 votes |
/**
 * Parses the input with jsoup's XML parser and renders it according to {@code outType}.
 *
 * @param str markup to parse
 * @return the document's inner HTML for TYPE_HTML; its text for TYPE_TEXT and any other value
 */
private String parse(String str) {
    Document document = Jsoup.parse(str, "", Parser.xmlParser());
    switch (outType) {
    case TYPE_HTML:
        return document.html();
    case TYPE_TEXT:
    default:
        // Text output is both the explicit TYPE_TEXT behavior and the fallback.
        return document.text();
    }
}
Example #6
Source Project: neembuu-uploader Author: Neembuu-Uploader File: BoxDotComAccount.java License: GNU General Public License v3.0 | 6 votes |
/** * Read information about user. Here you can read other important info. * @throws Exception */ private void getUserInfo() throws Exception { //https://www.box.net/api/1.0/rest?action=get_auth_token&api_key=vkf3k5dh0tg1ibvcikjcp8sx0f89d14u&ticket= //https://www.box.net/api/1.0/rest?action=get_auth_token&api_key=vkf3k5dh0tg1ibvcikjcp8sx0f89d14u&ticket=xybt9orxzo1xrr5vk4r0axne804y1tpk NULogger.getLogger().log(Level.INFO, "{0} Getting auth token value............", getClass()); httpGet = new NUHttpGet("https://www.box.net/api/1.0/rest?action=get_auth_token&api_key=vkf3k5dh0tg1ibvcikjcp8sx0f89d14u&ticket=" + ticket); httpResponse = httpclient.execute(httpGet, httpContext); responseString = EntityUtils.toString(httpResponse.getEntity()); //NULogger.getLogger().log(Level.INFO, "{0}Response : {1}", new Object[]{getClass(), stringResponse}); doc = Jsoup.parse(responseString, "", Parser.xmlParser()); String auth_token = doc.select("response auth_token").text(); NULogger.getLogger().log(Level.INFO, "{0} Auth_token : {1}", new Object[]{getClass(), auth_token}); properties().setEncryptedProperty(KEY_AUTH_TOKEN, auth_token); }
Example #7
Source Project: jsoup-learning Author: code4craft File: SelectorTest.java License: MIT License | 6 votes |
/**
 * Parses a small HTML sample with the HTML parser and prints every element
 * matched by the selector {@code body div}.
 */
public static void main(String[] args) {
    String html =
            "<body>\n"
            + " <textarea>\n"
            + " <!-- Text -->\n"
            + " xxx\n"
            + " </textarea> \n"
            + " <div> \n"
            + " <table> \n"
            + " <!-- InTable --> \n"
            + " <!-- InTableText --> xxx \n"
            + " <tbody> \n"
            + " <tr> \n"
            + " <!-- InRow --> \n"
            + " <td> \n"
            + " <!-- InCell --> </td> \n"
            + " </tr> \n"
            + " </tbody> \n"
            + " </table> \n"
            + " </div> \n"
            + "</body>";

    Document document = Parser.htmlParser().parseInput(html, "");
    Elements divsInBody = document.select("body div");
    System.out.println(divsInBody);
}
Example #8
Source Project: astor Author: SpoonLabs File: DataUtilTest.java License: GNU General Public License v2.0 | 6 votes |
/**
 * Parses a page whose meta tag declares the charset name {@code iso-8} and checks
 * that the document is still produced with the expected serialized form.
 */
@Test
public void wrongMetaCharsetFallback() {
    final String expected =
            "<html>\n"
            + " <head>\n"
            + " <meta charset=\"iso-8\">\n"
            + " </head>\n"
            + " <body></body>\n"
            + "</html>";

    try {
        final byte[] input = "<html><head><meta charset=iso-8></head><body></body></html>".getBytes("UTF-8");

        // No charset is passed (null), so the parser has to decide the encoding itself.
        Document doc = DataUtil.parseByteData(
                ByteBuffer.wrap(input), null, "http://example.com", Parser.htmlParser());

        assertEquals(expected, doc.toString());
    } catch (UnsupportedEncodingException ex) {
        fail(ex.getMessage());
    }
}
Example #9
Source Project: astor Author: SpoonLabs File: UrlConnectTest.java License: GNU General Public License v2.0 | 6 votes |
/**
 * Requests an endpoint that answers 200 with no body, once with the default parser
 * and once with the XML parser, and checks that parsing succeeds with status 200.
 */
@Test
public void handles200WithNoContent() throws IOException {
    final String noContentUrl = "http://direct.infohound.net/tools/200-no-content.pl";

    // First pass: default parser.
    Connection.Response defaultRes = Jsoup
            .connect(noContentUrl)
            .userAgent(browserUa)
            .execute();
    defaultRes.parse(); // must not throw on an empty body
    assertEquals(200, defaultRes.statusCode());

    // Second pass: explicit XML parser.
    Connection.Response xmlRes = Jsoup
            .connect(noContentUrl)
            .parser(Parser.xmlParser())
            .userAgent(browserUa)
            .execute();
    xmlRes.parse(); // must not throw on an empty body
    assertEquals(200, xmlRes.statusCode());
}
Example #10
Source Project: astor Author: SpoonLabs File: UrlConnectTest.java License: GNU General Public License v2.0 | 5 votes |
@Test public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException { // should auto-detect xml and use XML parser, unless explicitly requested the html parser String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml"; Connection con = Jsoup.connect(xmlUrl).parser(Parser.htmlParser()); Document doc = con.get(); Connection.Request req = con.request(); assertTrue(req.parser().getTreeBuilder() instanceof HtmlTreeBuilder); assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml())); }
Example #11
Source Project: jsoup-learning Author: code4craft File: ParserCorrectorTest.java License: MIT License | 5 votes |
/**
 * Parses an HTML sample containing an unclosed {@code <div>} with error tracking
 * enabled (up to 100 errors) and prints the parse errors that were recorded.
 */
public static void main(String[] args) {
    String htmlWithDivUnclosed =
            "<body>\n"
            + " <textarea>\n"
            + " <!-- Text -->\n"
            + " xxx\n"
            + " </textarea> \n"
            + " <div> \n"
            + " <div>\n"
            + " <table> \n"
            + " <!-- InTable --> \n"
            + " <!-- InTableText --> xxx \n"
            + " <tbody> \n"
            + " <tr> \n"
            + " <!-- InRow --> \n"
            + " <td> \n"
            + " <!-- InCell --> </td> \n"
            + " </tr> \n"
            + " </tbody> \n"
            + " </table> \n"
            + " </div> \n"
            + "</body>";

    Parser errorTrackingParser = Parser.htmlParser();
    errorTrackingParser.setTrackErrors(100);

    // Only the recorded errors are of interest; the resulting document is discarded.
    errorTrackingParser.parseInput(htmlWithDivUnclosed, "");

    List<ParseError> errors = errorTrackingParser.getErrors();
    System.out.println(errors);
}
Example #12
Source Project: ankihelper Author: mmjang File: YoudaoOnline.java License: GNU General Public License v3.0 | 5 votes |
static public YoudaoResult getDefinition(String key) throws IOException{ // Document doc = Jsoup.connect(String.format(BASE_URL, key.trim())) // .userAgent("Mozilla") // .cookie("auth", "token") // .timeout(2000) // .parser(Parser.xmlParser()) // .get(); //doc.toString(); Request request = new Request.Builder().url(String.format(BASE_URL, key.trim())).build(); String rawhtml = MyApplication.getOkHttpClient().newCall(request).execute().body().string(); Document doc = Jsoup.parse(rawhtml, "", Parser.xmlParser()); String phonetic = getSingleQueryResult(doc, "phonetic-symbol"); String returnPhrase = getSingleQueryResult(doc, "return-phrase"); List<String> translation = new ArrayList<String>(); for(Element e : doc.select("translation > content")){ translation.add(e.text()); } Map<String, List<String>> webTranslation = new LinkedHashMap<>(); for(Element web : doc.select("web-translation")){ String keyString = getSingleQueryResult(web, "key"); List<String> values = new ArrayList<>(); for(Element value : web.select("trans > value")){ String valueString = value.text().trim(); values.add(valueString); } webTranslation.put(keyString, values); } YoudaoResult youdaoResult = new YoudaoResult(); youdaoResult.phonetic = phonetic; youdaoResult.returnPhrase = returnPhrase; youdaoResult.translation = translation; youdaoResult.webTranslation = webTranslation; return youdaoResult; }
Example #13
Source Project: substitution-schedule-parser Author: vertretungsplanme File: IndiwareMobileParser.java License: Mozilla Public License 2.0 | 5 votes |
/**
 * Collects class names from the first daily XML file that can be downloaded.
 * <p>
 * Day offsets from -4 up to (but excluding) MAX_DAYS relative to today are tried in
 * order. The first file that downloads is parsed and the text of the first
 * {@code Kurz} element of every {@code Klassen > Kl} entry is returned.
 *
 * @param filePrefix file name prefix placed before the yyyyMMdd date
 * @return the class names; an empty list if no day was tried
 * @throws HttpResponseException the last HTTP failure when every attempted download failed
 */
@NotNull
private List<String> parseClasses(String filePrefix) throws JSONException, IOException, CredentialInvalidException {
    final String baseurl = data.getString(PARAM_BASEURL) + "/";
    HttpResponseException lastException = null;

    for (int dayOffset = -4; dayOffset < MAX_DAYS; dayOffset++) {
        LocalDate date = LocalDate.now().plusDays(dayOffset);
        String dateStr = DateTimeFormat.forPattern("yyyyMMdd").print(date);
        // The current time is appended as the "_" query parameter.
        String url = baseurl + "mobdaten/" + filePrefix + dateStr + ".xml?_=" + System.currentTimeMillis();

        try {
            Document doc = Jsoup.parse(httpGet(url, "UTF-8"), url, Parser.xmlParser());
            List<String> classes = new ArrayList<>();
            for (Element klasse : doc.select("Klassen > Kl")) {
                classes.add(klasse.select("Kurz").first().text());
            }
            return classes;
        } catch (HttpResponseException e) {
            // Remember the failure and try the next day.
            lastException = e;
        }
    }

    if (lastException == null) {
        return new ArrayList<>();
    }
    throw lastException;
}
Example #14
Source Project: jsoup-learning Author: code4craft File: PageErrorChecker.java License: MIT License | 5 votes |
/**
 * Downloads the page at the given URL and returns the parse errors reported by an
 * error-tracking HTML parser (at most 100 errors are tracked).
 *
 * @param url address of the page to check; also used as the base URI for parsing
 * @return the errors recorded while parsing the page body
 * @throws IOException if the page cannot be fetched
 */
public static List<ParseError> check(String url) throws IOException {
    final String userAgent =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36";

    Parser parser = Parser.htmlParser();
    parser.setTrackErrors(100);

    String body = Jsoup.connect(url)
            .userAgent(userAgent)
            .execute()
            .body();

    // Only the error list matters; the parsed document is not needed.
    parser.parseInput(body, url);
    return parser.getErrors();
}
Example #15
Source Project: substitution-schedule-parser Author: vertretungsplanme File: IndiwareDemoTest.java License: Mozilla Public License 2.0 | 5 votes |
/**
 * Checks that the XML input and the HTML input are parsed into equal schedule days.
 */
@Test
public void testEquals() throws IOException, JSONException {
    // XML input goes through jsoup's XML parser and is parsed with the flag set to false.
    SubstitutionScheduleDay fromXml =
            parser.parseIndiwareDay(Jsoup.parse(xml, "", Parser.xmlParser()), false);

    // HTML input uses jsoup's default parser and is parsed with the flag set to true.
    SubstitutionScheduleDay fromHtml =
            parser.parseIndiwareDay(Jsoup.parse(html), true);

    assertEquals(fromXml, fromHtml);
}
Example #16
Source Project: CrawlerPack Author: abola File: CrawlerPack.java License: Apache License 2.0 | 5 votes |
/** * 將 HTML 轉化為 Jsoup Document 物件 * * HTML的內容就使用Jsoup原生的 HTML Parser * * @param html Html document * @return org.jsoup.nodes.Document */ public org.jsoup.nodes.Document htmlToJsoupDoc(String html){ // 將 html(html/html5) 轉為 jsoup Document 物件 Document jsoupDoc = Jsoup.parse(html, "UTF-8", Parser.htmlParser() ); jsoupDoc.charset(StandardCharsets.UTF_8); return jsoupDoc; }
Example #17
Source Project: CrawlerPack Author: abola File: CrawlerPack.java License: Apache License 2.0 | 5 votes |
/** * 將 XML 轉化為 Jsoup Document 物件 * * Jsoup 1.9.1+ supported non-ascii tag * ----- * 如果碰到Tag 名稱首字元非 a-zA-Z 的字元,jsoup 會解析為註解 * 所以必需用騙的先置入 prefix * 再改寫xmlParse 在回傳時移除prefix * * @param xml XML format string * @return org.jsoup.nodes.Document */ public org.jsoup.nodes.Document xmlToJsoupDoc(String xml){ // Tag 首字元非 a-zA-Z 時轉化為註解的問題 //xml = xml.replaceAll("<([^A-Za-z\\/! ][^\\/>]*)>", "<"+prefix.toLowerCase()+"$1>") // .replaceAll("<\\/([^A-Za-z\\/ ][^\\/>]*)>", "</"+prefix.toLowerCase()+"$1>"); // 將 xml 轉為 jsoup Document 物件 //Document jsoupDoc = Jsoup.parse(xml, "", new Parser( new PrefixXmlTreeBuilder(prefix.toLowerCase()) ) ); Document jsoupDoc = Jsoup.parse(xml, "", Parser.xmlParser() ); jsoupDoc.charset(StandardCharsets.UTF_8); return jsoupDoc; }
Example #18
Source Project: astor Author: SpoonLabs File: Node.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Parses the given HTML as a fragment and inserts the resulting nodes into this
 * node's parent at the given child index.
 *
 * @param index position within the parent's children at which to insert
 * @param html  markup to parse; must not be null (this node must also have a parent)
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    // The parent is only usable as parsing context when it is an Element.
    Element context = null;
    if (parent() instanceof Element) {
        context = (Element) parent();
    }

    List<Node> parsedNodes = Parser.parseFragment(html, context, baseUri());
    Node[] siblings = parsedNodes.toArray(new Node[parsedNodes.size()]);
    parentNode.addChildren(index, siblings);
}
Example #19
Source Project: jsoup-learning Author: code4craft File: HttpConnection.java License: MIT License | 5 votes |
/**
 * Creates a request with the default settings: GET method, 3000 ms timeout,
 * 1 MB body size limit, redirects followed, gzip accepted, HTML parser.
 */
private Request() {
    // Transfer limits
    timeoutMilliseconds = 3000;
    maxBodySizeBytes = 1024 * 1024; // 1MB

    // Request shape
    method = Connection.Method.GET;
    followRedirects = true;
    data = new ArrayList<Connection.KeyVal>();
    headers.put("Accept-Encoding", "gzip");

    // Responses are parsed as HTML unless another parser is set.
    parser = Parser.htmlParser();
}
Example #20
Source Project: astor Author: SpoonLabs File: HttpConnection.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Creates a request with the default settings: GET method, 3000 ms timeout,
 * 1 MB body size limit, redirects followed, gzip accepted, HTML parser.
 */
private Request() {
    // Transfer limits
    timeoutMilliseconds = 3000;
    maxBodySizeBytes = 1024 * 1024; // 1MB

    // Request shape
    method = Method.GET;
    followRedirects = true;
    data = new ArrayList<Connection.KeyVal>();
    headers.put("Accept-Encoding", "gzip");

    // Responses are parsed as HTML unless another parser is set.
    parser = Parser.htmlParser();
}
Example #21
Source Project: storm-crawler Author: DigitalPebble File: CharsetIdentification.java License: Apache License 2.0 | 5 votes |
/** * Attempt to find a META tag in the HTML that hints at the character set * used to write the document. */ private static String getCharsetFromMeta(byte buffer[], int maxlength) { // convert to UTF-8 String -- which hopefully will not mess up the // characters we're interested in... int len = buffer.length; if (maxlength > 0 && maxlength < len) { len = maxlength; } String html = new String(buffer, 0, len, DEFAULT_CHARSET); String foundCharset = null; try { Document doc = Parser.htmlParser().parseInput(html, "dummy"); // look for <meta http-equiv="Content-Type" // content="text/html;charset=gb2312"> or HTML5 <meta // charset="gb2312"> Elements metaElements = doc .select("meta[http-equiv=content-type], meta[charset]"); for (Element meta : metaElements) { if (meta.hasAttr("http-equiv")) foundCharset = getCharsetFromContentType(meta .attr("content")); if (foundCharset == null && meta.hasAttr("charset")) foundCharset = meta.attr("charset"); if (foundCharset != null) return foundCharset; } } catch (Exception e) { foundCharset = null; } return foundCharset; }
Example #22
Source Project: android-opensource-library-56 Author: android-opensource-library-56 File: RssLoader.java License: Apache License 2.0 | 5 votes |
@Override public RssList loadInBackground() { try { Document document = Jsoup.connect(this.mFeed.url) .parser(Parser.xmlParser()).get(); parseCssSelector(document); // parseDomTraverse(document); } catch (Exception e) { e.printStackTrace(); } return mList; }
Example #23
Source Project: astor Author: SpoonLabs File: Element.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Add inner HTML into this element. The supplied HTML is parsed as a fragment in the
 * context of this element, and the resulting nodes are inserted at the start of the
 * element's children.
 *
 * @param html HTML to add inside this element, before the existing HTML
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    List<Node> parsedNodes = Parser.parseFragment(html, this, baseUri());
    Node[] leadingChildren = parsedNodes.toArray(new Node[parsedNodes.size()]);

    // Index 0 places the new nodes ahead of all existing children.
    addChildren(0, leadingChildren);
    return this;
}
Example #24
Source Project: jsoup-learning Author: code4craft File: DataUtil.java License: MIT License | 5 votes |
/**
 * Loads a file to a Document, parsing its bytes with the HTML parser.
 *
 * @param in          file to load
 * @param charsetName character set of input
 * @param baseUri     base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 */
public static Document load(File in, String charsetName, String baseUri) throws IOException {
    // If opening fails there is nothing to close, so the stream is created before the try.
    FileInputStream inStream = new FileInputStream(in);
    try {
        ByteBuffer byteData = readToByteBuffer(inStream);
        return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
    } finally {
        inStream.close();
    }
}
Example #25
Source Project: astor Author: SpoonLabs File: Element.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Add inner HTML to this element. The supplied HTML is parsed as a fragment in the
 * context of this element, and the resulting nodes are appended to the end of the
 * children.
 *
 * @param html HTML to add inside this element, after the existing HTML
 * @return this element
 * @see #html(String)
 */
public Element append(String html) {
    Validate.notNull(html);

    List<Node> parsedNodes = Parser.parseFragment(html, this, baseUri());
    Node[] trailingChildren = parsedNodes.toArray(new Node[parsedNodes.size()]);

    addChildren(trailingChildren);
    return this;
}
Example #26
Source Project: astor Author: SpoonLabs File: UrlConnectTest.java License: GNU General Public License v2.0 | 5 votes |
@Test public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException { // should auto-detect xml and use XML parser, unless explicitly requested the html parser String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml"; Connection con = Jsoup.connect(xmlUrl).parser(Parser.htmlParser()); Document doc = con.get(); Connection.Request req = con.request(); assertTrue(req.parser().getTreeBuilder() instanceof HtmlTreeBuilder); assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml())); }
Example #27
Source Project: astor Author: SpoonLabs File: DataUtilTest.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Feeds UTF-8 bytes that start with a byte order mark to the parser without naming a
 * charset, then checks the head text and that the output charset is UTF-8.
 */
@Test
public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() {
    // \uFEFF is the byte order mark placed in front of the markup.
    String htmlWithBom = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
    ByteBuffer encoded = Charset.forName("UTF-8").encode(htmlWithBom);

    // Charset argument is null: no charset is supplied by the caller.
    Document doc = DataUtil.parseByteData(encoded, null, "http://foo.com/", Parser.htmlParser());

    String headText = doc.head().text();
    String outputCharset = doc.outputSettings().charset().displayName();
    assertEquals("One", headText);
    assertEquals("UTF-8", outputCharset);
}
Example #28
Source Project: astor Author: SpoonLabs File: Cleaner.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Tests whether the given body HTML is valid: it must parse as a fragment without
 * any tracked parse error, and copying it through {@code copySafeNodes} must not
 * discard anything.
 *
 * @param bodyHtml HTML fragment to check, parsed in the context of a body element
 * @return true if nothing was discarded and no parse error was recorded
 */
public boolean isValidBodyHtml(String bodyHtml) {
    Document clean = Document.createShell("");
    Document dirty = Document.createShell("");

    // Tracking capacity of 1 is enough: any recorded error makes the input invalid.
    ParseErrorList errorList = ParseErrorList.tracking(1);
    List<Node> parsedNodes = Parser.parseFragment(bodyHtml, dirty.body(), "", errorList);
    dirty.body().insertChildren(0, parsedNodes);

    int numDiscarded = copySafeNodes(dirty.body(), clean.body());
    if (numDiscarded != 0) {
        return false;
    }
    return errorList.size() == 0;
}
Example #29
Source Project: astor Author: SpoonLabs File: HttpConnection.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Creates a request with the default settings: GET method, 30 second timeout,
 * 1 MB body size limit, redirects followed, gzip accepted, default user agent,
 * HTML parser.
 */
Request() {
    // Transfer limits
    timeoutMilliseconds = 30000; // 30 seconds
    maxBodySizeBytes = 1024 * 1024; // 1MB

    // Request shape
    method = Method.GET;
    followRedirects = true;
    data = new ArrayList<>();

    // Default headers
    addHeader("Accept-Encoding", "gzip");
    addHeader(USER_AGENT, DEFAULT_UA);

    // Responses are parsed as HTML unless another parser is set.
    parser = Parser.htmlParser();
}
Example #30
Source Project: storm-crawler Author: DigitalPebble File: TextExtractorTest.java License: Apache License 2.0 | 5 votes |
/**
 * Configures the extractor to exclude {@code style} and checks that the content of
 * an upper-case {@code <STYLE>} element is left out of the extracted text.
 */
@Test
public void testExclusionCase() throws IOException {
    Config conf = new Config();
    conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "style");
    TextExtractor extractor = new TextExtractor(conf);

    // The tag is upper-case in the input while the exclusion is configured in lower case.
    String content = "<html>the<STYLE>main</STYLE>content of the page</html>";
    Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://stormcrawler.net");

    assertEquals("the content of the page", extractor.text(jsoupDoc.body()));
}