Java Code Examples for org.jsoup.Jsoup
The following examples show how to use
org.jsoup.Jsoup. These examples are extracted from open source projects.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: superword Source File: WordClassifierForOxford.java License: Apache License 2.0 | 6 votes |
public static String getContent(String word) { String url = OXFORD + word + "?renovate=" + (new Random(System.currentTimeMillis()).nextInt(899999)+100000); LOGGER.debug("url:"+url); Connection conn = Jsoup.connect(url) .header("Accept", ACCEPT) .header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE) .header("Connection", CONNECTION) .header("Referer", REFERER) .header("Host", HOST) .header("User-Agent", USER_AGENT) .timeout(60000) .ignoreContentType(true); String html = ""; try { html = conn.post().html(); html = html.replaceAll("[\n\r]", ""); }catch (Exception e){ //LOGGER.error("获取URL:"+url+"页面出错", e); LOGGER.error("获取URL:"+url+"页面出错"); } return html; }
Example 2
Source Project: astor Source File: ElementTest.java License: GNU General Public License v2.0 | 6 votes |
@Test public void testAppendTo() { String parentHtml = "<div class='a'></div>"; String childHtml = "<div class='b'></div><p>Two</p>"; Document parentDoc = Jsoup.parse(parentHtml); Element parent = parentDoc.body(); Document childDoc = Jsoup.parse(childHtml); Element div = childDoc.select("div").first(); Element p = childDoc.select("p").first(); Element appendTo1 = div.appendTo(parent); assertEquals(div, appendTo1); Element appendTo2 = p.appendTo(div); assertEquals(p, appendTo2); assertEquals("<div class=\"a\"></div>\n<div class=\"b\">\n <p>Two</p>\n</div>", parentDoc.body().html()); assertEquals("", childDoc.body().html()); // got moved out }
Example 3
Source Project: UpdogFarmer Source File: SteamWebHandler.java License: GNU General Public License v3.0 | 6 votes |
/**
 * Unlock Steam parental controls with a pin.
 *
 * @param pin value submitted as the {@code pin} form field
 * @return the value of the {@code steamparental} response cookie ({@code null} if the
 *         response carries no such cookie), or {@code null} when the request throws
 */
private String unlockParental(String pin) {
    final String url = STEAM_STORE + "parental/ajaxunlock";
    try {
        final Connection request = Jsoup.connect(url)
                .referrer(STEAM_STORE)
                .followRedirects(true)
                .ignoreContentType(true)
                .cookies(generateWebCookies())
                .data("pin", pin)
                .method(Connection.Method.POST);
        return request.execute().cookies().get("steamparental");
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
Example 4
Source Project: astor Source File: ElementTest.java License: GNU General Public License v2.0 | 6 votes |
@Test public void testClone() { Document doc = Jsoup.parse("<div><p>One<p><span>Two</div>"); Element p = doc.select("p").get(1); Element clone = p.clone(); assertNull(clone.parent()); // should be orphaned assertEquals(0, clone.siblingIndex); assertEquals(1, p.siblingIndex); assertNotNull(p.parent()); clone.append("<span>Three"); assertEquals("<p><span>Two</span><span>Three</span></p>", TextUtil.stripNewlines(clone.outerHtml())); assertEquals("<div><p>One</p><p><span>Two</span></p></div>", TextUtil.stripNewlines(doc.body().html())); // not modified doc.body().appendChild(clone); // adopt assertNotNull(clone.parent()); assertEquals("<div><p>One</p><p><span>Two</span></p></div><p><span>Two</span><span>Three</span></p>", TextUtil.stripNewlines(doc.body().html())); }
Example 5
Source Project: meizhi Source File: AlbumRequest.java License: Apache License 2.0 | 6 votes |
/**
 * Decodes the response body with the charset taken from the response headers, parses it
 * as HTML and collects one {@code Image} per matching {@code img} element that has a
 * non-empty {@code src} attribute.
 *
 * @param response the raw network response
 * @return a success response carrying the collected images, or an error response wrapping
 *         the {@link UnsupportedEncodingException} when the body cannot be decoded
 */
@Override
protected Response<List<Image>> parseNetworkResponse(NetworkResponse response) {
    try {
        List<Image> images = new ArrayList<>();
        String charset = HttpHeaderParser.parseCharset(response.headers);
        Document document = Jsoup.parse(new String(response.data, charset));
        for (Element img : document.select(".container.main .box.show-box img")) {
            String src = img.attr("src");
            if (!TextUtils.isEmpty(src)) {
                Image image = new Image();
                image.url = src;
                images.add(image);
            }
        }
        return Response.success(images, HttpHeaderParser.parseCacheHeaders(response));
    } catch (UnsupportedEncodingException e) {
        return Response.error(new ParseError(e));
    }
}
Example 6
Source Project: TrackRay Source File: FuckBroDomain.java License: GNU General Public License v3.0 | 6 votes |
/**
 * Fetches the icp.aizhan.com page for {@code domain} and, when the page contains the
 * expected marker texts, reads the rows of the {@code div#company .table-s1} table.
 *
 * @param domain value substituted into the lookup URL
 * @return a map from the text of each row's third cell to the text of its second cell;
 *         empty when the markers are absent or an exception occurred (exceptions are
 *         added to the task's exception list)
 */
public Map<String,String> aizhanIcp(String domain){
    HashMap<String, String> result = new HashMap<>();
    HttpClient httpClient = new HttpClient();
    String urlTemplate = "https://icp.aizhan.com/%s/";
    try {
        ResponseStatus responseStatus = httpClient.get(String.format(urlTemplate, domain));
        String html = responseStatus.getContent();
        boolean hasRecords = !html.contains("未找到")
                && html.contains("该单位备案网站")
                && html.contains("缓存于");
        if (hasRecords) {
            Document doc = Jsoup.parse(html);
            for (Element row : doc.select("div#company .table-s1 tbody tr")) {
                Elements cells = row.select("td");
                String title = cells.get(1).text();
                String dom = cells.get(2).text();
                result.put(dom, title);
            }
        }
    } catch (Exception e) {
        task.getExceptions().add(e);
    }
    SysLog.info("ICP反查结束");
    return result;
}
Example 7
Source Project: bbs Source File: TextFilterManage.java License: GNU Affero General Public License v3.0 | 6 votes |
/** * 读取上传图片路径名称 * @param html * @param item 项目 * @return */ public List<String> readImageName(String html,String item) { //上传图片文件名称 List<String> imageNameList = new ArrayList<String>(); if(!StringUtils.isBlank(html)){ Document doc = Jsoup.parseBodyFragment(html); //图片 Elements image_elements = doc.select("img[src]"); for (Element element : image_elements) { String imageUrl = element.attr("src"); if(StringUtils.startsWithIgnoreCase(imageUrl, "file/"+item+"/")){ imageNameList.add(imageUrl); } } } return imageNameList; }
Example 8
Source Project: nju-lib-downloader Source File: Book.java License: GNU General Public License v3.0 | 6 votes |
public List<Node> getOutline() throws IOException { for (int i = 0; i < 20; i++) { try { String url = CoreService.baseUrl + "/book/getDirectoryTree.jsps?bookId=" + idInt + "&type=PDF"; //http://sxnju.chineseall.cn/book/getDirectoryTree.jsps?bookId=10060602592&type=PDF&_=1504844448871 String result = MyHttpRequest.get(url, null, "UTF-8", 3000); result = new ObjectMapper().readValue(result, ObjectNode.class).get("data").textValue(); Document doc = Jsoup.parse(result); Elements elements = doc.select("ul[id=directoryTree]"); return parseUL(elements.get(0)); } catch (Exception e) { if (i == 19) { throw e; } } } return null; }
Example 9
Source Project: warnings-ng-plugin Source File: SourcePrinterTest.java License: MIT License | 6 votes |
/**
 * Renders "format-java.txt" for an issue built with default builder settings and checks
 * that both the document text and the text of its {@code pre} elements match the file
 * content, ignoring whitespace.
 */
@Test
void shouldCreateSourceWithoutLineNumber() {
    SourcePrinter printer = new SourcePrinter();
    Issue issue = new IssueBuilder().build();

    String rendered = printer.render(asStream("format-java.txt"), issue, NO_DESCRIPTION, ICON_URL);
    Document document = Jsoup.parse(rendered);

    String expectedFile = toString("format-java.txt");
    assertThat(document.text()).isEqualToIgnoringWhitespace(expectedFile);

    Elements preBlocks = document.getElementsByTag("pre");
    assertThat(preBlocks.text()).isEqualToIgnoringWhitespace(expectedFile);
}
Example 10
Source Project: astor Source File: ElementTest.java License: GNU General Public License v2.0 | 6 votes |
/**
 * Removes every sibling located before the "XXX" paragraph and checks that only that
 * paragraph and the ones after it remain.
 */
@Test
public void testRemoveBeforeIndex() {
    String html = "<html><body><div><p>before1</p><p>before2</p><p>XXX</p><p>after1</p><p>after2</p></div></body></html>";
    Document doc = Jsoup.parse(html, "");
    Element body = doc.select("body").first();

    Element marker = body.select("p:matchesOwn(XXX)").first();
    Elements preceding = marker.parent().getElementsByIndexLessThan(marker.elementSiblingIndex());
    for (Element sibling : preceding) {
        sibling.remove();
    }

    assertEquals("<body><div><p>XXX</p><p>after1</p><p>after2</p></div></body>",
            TextUtil.stripNewlines(body.outerHtml()));
}
Example 11
Source Project: astor Source File: CleanerTest.java License: GNU General Public License v2.0 | 6 votes |
@Test public void testIsValidBodyHtml() { String ok = "<p>Test <b><a href='http://example.com/' rel='nofollow'>OK</a></b></p>"; String ok1 = "<p>Test <b><a href='http://example.com/'>OK</a></b></p>"; // missing enforced is OK because still needs run thru cleaner String nok1 = "<p><script></script>Not <b>OK</b></p>"; String nok2 = "<p align=right>Test Not <b>OK</b></p>"; String nok3 = "<!-- comment --><p>Not OK</p>"; // comments and the like will be cleaned String nok4 = "<html><head>Foo</head><body><b>OK</b></body></html>"; // not body html String nok5 = "<p>Test <b><a href='http://example.com/' rel='nofollowme'>OK</a></b></p>"; String nok6 = "<p>Test <b><a href='http://example.com/'>OK</b></p>"; // missing close tag String nok7 = "</div>What"; assertTrue(Jsoup.isValid(ok, Whitelist.basic())); assertTrue(Jsoup.isValid(ok1, Whitelist.basic())); assertFalse(Jsoup.isValid(nok1, Whitelist.basic())); assertFalse(Jsoup.isValid(nok2, Whitelist.basic())); assertFalse(Jsoup.isValid(nok3, Whitelist.basic())); assertFalse(Jsoup.isValid(nok4, Whitelist.basic())); assertFalse(Jsoup.isValid(nok5, Whitelist.basic())); assertFalse(Jsoup.isValid(nok6, Whitelist.basic())); assertFalse(Jsoup.isValid(ok, Whitelist.none())); assertFalse(Jsoup.isValid(nok7, Whitelist.basic())); }
Example 12
Source Project: superword Source File: IPUtils.java License: Apache License 2.0 | 6 votes |
/**
 * Fetches the ip138.com lookup page for {@code ip} (60 second timeout) and collects,
 * from every non-blank {@code ul li} text that splits into exactly two parts on
 * {@code ":"}, the second part.
 *
 * @param ip value appended to the lookup URL
 * @return the collected values; whatever was gathered before a failure (possibly
 *         nothing) when an exception occurs, which is logged
 */
public static List<String> getIPLocation(String ip){
    List<String> locations = new ArrayList<>();
    try {
        URL lookup = new URL("http://ip138.com/ips138.asp?ip=" + ip);
        for (Element item : Jsoup.parse(lookup, 60000).select("ul li")) {
            String text = item.text();
            if (!StringUtils.isNotBlank(text)) {
                continue;
            }
            String[] parts = text.split(":");
            if (parts.length == 2) {
                locations.add(parts[1]);
            }
        }
    } catch (Exception e) {
        LOG.error("获取IP地址的地理位置", e);
    }
    return locations;
}
Example 13
Source Project: Pixiv-Illustration-Collection-Backend Source File: NewService.java License: Apache License 2.0 | 6 votes |
private void pullACGMHNews() throws IOException, InterruptedException { HttpRequest request = HttpRequest.newBuilder() .uri(URI.create("https://www.acgmh.com/category/news")).POST(HttpRequest.BodyPublishers.ofString("type=catL3&paged=1")).build(); String body = httpClient.send(request, HttpResponse.BodyHandlers.ofString()).body(); //ACGMHNewsDTO acgmhNewsDTO = objectMapper.readValue(body, ACGMHNewsDTO.class); Document doc = Jsoup.parse(body); Elements elements = doc.getElementsByClass("pos-r pd10 post-list box mar10-b content"); List<ACGNew> acgNewList = elements.stream().map(e -> { String style = e.getElementsByClass("preview thumb-in").get(0).attr("style"); String cover = style.substring(style.indexOf("('") + 2, style.length() - 2); String author = e.getElementsByClass("users").text(); String createDate = e.getElementsByClass("timeago").text(); Elements es = e.getElementsByClass("entry-title"); String title = es.text(); String refererUrl = es.get(0).getElementsByTag("a").get(0).attr("href"); String intro = e.getElementsByClass("mar10-b post-ex mar10-t mobile-hide").text(); return new ACGNew(title, intro, author, cover, refererUrl, LocalDate.parse(createDate.substring(0, 10)), NewsCrawlerConstant.ACGMH); }).collect(Collectors.toList()); process(acgNewList, "id", "content-innerText"); }
Example 14
Source Project: NLIWOD Source File: CDTClassifierEvaluation.java License: GNU Affero General Public License v3.0 | 6 votes |
public static ArrayList<String> loadSystemR(String system){ Path datapath = Paths.get("./src/main/resources/QALD6MultilingualLogs/multilingual_" + system + ".html"); ArrayList<String> result = Lists.newArrayList(); try{ String loadedData = Files.lines(datapath).collect(Collectors.joining()); Document doc = Jsoup.parse(loadedData); Element table = doc.select("table").get(5); Elements tableRows = table.select("tr"); for(Element row: tableRows){ Elements tableEntry = row.select("td"); result.add(tableEntry.get(1).ownText()); } result.remove(0); //remove the head of the table return result; }catch(IOException e){ e.printStackTrace(); log.debug("loading failed."); return result; } }
Example 15
Source Project: astor Source File: ParseTest.java License: GNU General Public License v2.0 | 6 votes |
/**
 * Parses the stored news.com.au home page and checks its title, one headline, and how
 * relative and absolute link targets resolve against the base URI.
 *
 * @throws IOException if the fixture file cannot be read
 */
@Test
public void testNewsHomepage() throws IOException {
    File fixture = getFile("/htmltests/news-com-au-home.html");
    Document doc = Jsoup.parse(fixture, "UTF-8", "http://www.news.com.au/");

    assertEquals("News.com.au | News from Australia and around the world online | NewsComAu", doc.title());
    assertEquals("Brace yourself for Metro meltdown", doc.select(".id1225817868581 h4").text().trim());

    // A relative href stays as written, while abs:href is resolved against the base URI.
    Element relativeLink = doc.select("a[href=/entertainment/horoscopes]").first();
    assertEquals("/entertainment/horoscopes", relativeLink.attr("href"));
    assertEquals("http://www.news.com.au/entertainment/horoscopes", relativeLink.attr("abs:href"));

    // For a link that is already absolute, href and abs:href must agree.
    Element absoluteLink = doc.select("a[href*=naughty-corners-are-a-bad-idea]").first();
    assertEquals(
            "http://www.heraldsun.com.au/news/naughty-corners-are-a-bad-idea-for-kids/story-e6frf7jo-1225817899003",
            absoluteLink.attr("href"));
    assertEquals(absoluteLink.attr("href"), absoluteLink.attr("abs:href"));
}
Example 16
Source Project: astor Source File: SelectorTest.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Checks that an id selector returns every element carrying that id, in document order,
 * and returns nothing when no element has it.
 */
@Test
public void testById() {
    Document duplicatedIds = Jsoup.parse("<div><p id=foo>Hello</p><p id=foo>Foo two!</p></div>");
    Elements matches = duplicatedIds.select("#foo");
    assertEquals(2, matches.size());
    assertEquals("Hello", matches.get(0).text());
    assertEquals("Foo two!", matches.get(1).text());

    Document otherId = Jsoup.parse("<div id=1></div>");
    Elements noMatches = otherId.select("#foo");
    assertEquals(0, noMatches.size());
}
Example 17
Source Project: astor Source File: DocumentTest.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Parses a document made of 100000 nested {@code <i>} elements and clones it.
 * Nothing is asserted; the test passes when parsing and cloning complete without
 * throwing. The test is currently ignored.
 */
@Ignore
@Test
public void testOverflowClone() {
    final int depth = 100000;
    // Append all opening tags, then all closing tags. This produces the same string as
    // repeatedly inserting at index 0, without shifting the whole buffer on every step.
    StringBuilder builder = new StringBuilder(depth * ("<i>".length() + "</i>".length()));
    for (int i = 0; i < depth; i++) {
        builder.append("<i>");
    }
    for (int i = 0; i < depth; i++) {
        builder.append("</i>");
    }

    Document doc = Jsoup.parse(builder.toString());
    doc.clone();
}
Example 18
Source Project: neembuu-uploader Source File: MegaIron.java License: GNU General Public License v3.0 | 5 votes |
/**
 * Downloads the megairon.net front page and reads the upload settings out of its
 * {@code file} form: the form action, and the values of the {@code srv_tmp_url} and
 * {@code sess_id} inputs.
 *
 * @throws Exception if fetching the page fails
 */
private void initialize() throws Exception {
    final String fileForm = "form[name=file]";

    responseString = NUHttpClientUtils.getData("http://megairon.net/", httpContext);
    doc = Jsoup.parse(responseString);

    uploadURL = doc.select(fileForm).attr("action");
    srv_tmp_url = doc.select(fileForm).select("input[name=srv_tmp_url]").attr("value");
    sessionID = doc.select(fileForm).select("input[name=sess_id]").attr("value");
}
Example 19
Source Project: JavaSkype Source File: WebConnector.java License: MIT License | 5 votes |
/**
 * Builds and executes a request against the given API path.
 *
 * @param method          HTTP method to use
 * @param apiPath         full URL when {@code absoluteApiPath} is true, otherwise a path
 *                        appended to {@code SERVER_HOSTNAME}
 * @param absoluteApiPath whether {@code apiPath} is already a complete URL
 * @param keyval          request data passed on to {@code Connection.data}
 * @return the executed response
 * @throws IOException if executing the request fails
 */
private Response sendRequest(Method method, String apiPath, boolean absoluteApiPath, String... keyval) throws IOException {
    String url;
    if (absoluteApiPath) {
        url = apiPath;
    } else {
        url = SERVER_HOSTNAME + apiPath;
    }

    // Up to 100 MiB of body, 10 s timeout; content type and HTTP error statuses are ignored.
    Connection conn = Jsoup.connect(url)
            .maxBodySize(100 * 1024 * 1024)
            .timeout(10000)
            .method(method)
            .ignoreContentType(true)
            .ignoreHttpErrors(true);
    logger.finest("Sending " + method + " request at " + url);

    if (skypeToken == null) {
        logger.fine("No token sent for the request at: " + url);
    } else {
        conn.header("X-Skypetoken", skypeToken);
    }
    conn.data(keyval);
    return conn.execute();
}
Example 20
Source Project: NovaGuilds Source File: PermissionInReadmeTest.java License: GNU General Public License v3.0 | 5 votes |
/**
 * Reads the permission nodes listed in the {@code permissions-table} element of
 * README.md and fails if any {@code Permission} value is not among them.
 *
 * @throws Exception if README.md cannot be read or at least one permission is missing
 */
@Test
public void testPermissionInReadme() throws Exception {
    final List<Permission> documented = new ArrayList<>();
    Document readme = Jsoup.parse(new File("./README.md"), "UTF-8");
    // The second child of the table is taken as the container of the rows.
    Element rows = readme.getElementById("permissions-table").child(1);

    for (Element row : rows.children()) {
        String node = row.child(0).text();
        documented.add(Permission.fromPath(node));
    }

    int missing = 0;
    for (Permission permission : Permission.values()) {
        if (documented.contains(permission)) {
            continue;
        }
        // The heading is printed once, right before the first missing permission.
        if (missing == 0) {
            System.out.println("Found missing permissions:");
        }
        System.out.println(permission.name());
        missing++;
    }

    if (missing > 0) {
        throw new Exception("There are " + missing + " missing permissions in README.md");
    }
    System.out.println("All permissions are present in README.md");
}
Example 21
Source Project: guanggoo-android Source File: CommentTask.java License: Apache License 2.0 | 5 votes |
/**
 * Posts the comment ({@code mContent}) for the topic behind {@code mUrl} as a form
 * request and reports the outcome on the UI: success for a 200 or 302 status, failure
 * for any other status or an {@link IOException}.
 */
@Override
public void run() {
    String xsrf = getXsrf();

    Map<String, String> headers = new HashMap<>();
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
    headers.put("Content-Type", "application/x-www-form-urlencoded");

    Map<String, String> form = new HashMap<>();
    form.put("tid", UrlUtil.getTid(mUrl));
    form.put("content", mContent);
    form.put(ConstantUtil.KEY_XSRF, xsrf);

    // The XSRF token is sent as a form field and is also added as a cookie when absent.
    Map<String, String> cookies = getCookies();
    if (!cookies.containsKey(ConstantUtil.KEY_XSRF)) {
        cookies.put(ConstantUtil.KEY_XSRF, xsrf);
    }

    try {
        Connection.Response res = Jsoup.connect(mUrl)
                .cookies(cookies)
                .headers(headers)
                .data(form)
                .method(Connection.Method.POST)
                .execute();
        int status = res.statusCode();
        boolean accepted = status == ConstantUtil.HTTP_STATUS_200 || status == ConstantUtil.HTTP_STATUS_302;
        if (accepted) {
            successOnUI("评论成功");
            return;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    failedOnUI("评论失败");
}
Example 22
Source Project: astor Source File: ElementTest.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Checks that a {@code <br>} shows up as a single space in extracted text, whether or
 * not the markup already has whitespace around it.
 */
@Test
public void testBrHasSpace() {
    Document tight = Jsoup.parse("<p>Hello<br>there</p>");
    assertEquals("Hello there", tight.text());
    assertEquals("Hello there", tight.select("p").first().ownText());

    Document spaced = Jsoup.parse("<p>Hello <br> there</p>");
    assertEquals("Hello there", spaced.text());
}
Example 23
Source Project: pybbsMD Source File: FormatUtils.java License: Apache License 2.0 | 5 votes |
public static String handleHtml(String html) { // 保证html不为null html = TextUtils.isEmpty(html) ? "" : html; // 过滤xss Document document = cleaner.clean(Jsoup.parseBodyFragment(html, ApiDefine.HOST_BASE_URL)); // 返回body return document.body().html(); }
Example 24
Source Project: CrawlerPack Source File: CrawlerPack.java License: Apache License 2.0 | 5 votes |
/** * 將 XML 轉化為 Jsoup Document 物件 * * Jsoup 1.9.1+ supported non-ascii tag * ----- * 如果碰到Tag 名稱首字元非 a-zA-Z 的字元,jsoup 會解析為註解 * 所以必需用騙的先置入 prefix * 再改寫xmlParse 在回傳時移除prefix * * @param xml XML format string * @return org.jsoup.nodes.Document */ public org.jsoup.nodes.Document xmlToJsoupDoc(String xml){ // Tag 首字元非 a-zA-Z 時轉化為註解的問題 //xml = xml.replaceAll("<([^A-Za-z\\/! ][^\\/>]*)>", "<"+prefix.toLowerCase()+"$1>") // .replaceAll("<\\/([^A-Za-z\\/ ][^\\/>]*)>", "</"+prefix.toLowerCase()+"$1>"); // 將 xml 轉為 jsoup Document 物件 //Document jsoupDoc = Jsoup.parse(xml, "", new Parser( new PrefixXmlTreeBuilder(prefix.toLowerCase()) ) ); Document jsoupDoc = Jsoup.parse(xml, "", Parser.xmlParser() ); jsoupDoc.charset(StandardCharsets.UTF_8); return jsoupDoc; }
Example 25
Source Project: dkpro-c4corpus Source File: CybozuLanguageIdentifier.java License: Apache License 2.0 | 5 votes |
@Override public String identifyLanguage(String html) throws IOException { // extracting plain html text Document doc = Jsoup.parse(html); String text = doc.text(); // we might have removed everything -> no lang if (text.isEmpty()) { return UNKNOWN_LANGUAGE; } try { Detector detector = DetectorFactory.create(); detector.append(text); String detectedLang = detector.detect(); ArrayList<Language> detectedProbabilities = detector.getProbabilities(); if (detectedProbabilities.get(0).prob > PROBABILITY_THRESHOLD) { return detectedLang; } else { return UNKNOWN_LANGUAGE; } } catch (LangDetectException e) { return UNKNOWN_LANGUAGE; } }
Example 26
Source Project: Bookster Source File: XiaoShuWuService.java License: Apache License 2.0 | 5 votes |
/**
 * Loads the page at {@code url} on the executor service and collects its download
 * entries, blocking the caller until the background task has finished.
 *
 * The first entry pairs the text of a fixed paragraph with an empty URL; the following
 * entries are built from the links inside {@code body > div.list} (link text and
 * absolute href).
 *
 * @param url page to load
 * @return the collected entries; empty if the page could not be fetched
 * @throws InterruptedException if the calling thread is interrupted while waiting
 */
@Override
public ArrayList<DownloadBean> getDownloadurls(final String url) throws InterruptedException {
    latch = new CountDownLatch(1);
    final ArrayList<DownloadBean> urls = new ArrayList<>();
    mExecutorService.submit(new Runnable() {
        @Override
        public void run() {
            try {
                Document document = Jsoup.connect(url)
                        .timeout(10000)
                        .ignoreContentType(true)
                        .ignoreHttpErrors(true)
                        .userAgent(Url.MOBBILE_AGENT)
                        .get();
                String u1 = "";
                String u1n = document.select("body > div:nth-child(4) > p:nth-child(7)").text();
                Elements elements = document.select("body > div.list").select("a");
                urls.add(new DownloadBean(u1n, u1));
                for (Element element : elements) {
                    urls.add(new DownloadBean(element.text(), element.attr("abs:href")));
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Always release the waiting caller. Without this, any failure other
                // than IOException would skip the count-down and leave await() blocked
                // forever.
                latch.countDown();
            }
        }
    });
    latch.await();
    return urls;
}
Example 27
Source Project: markedj Source File: Marked.java License: Apache License 2.0 | 5 votes |
/**
 * Lexes and parses {@code src} into HTML and, when the options carry a whitelist,
 * cleans the HTML against it.
 *
 * @param src      source text to convert
 * @param options  options for lexer and parser; also supplies the optional whitelist
 * @param renderer renderer used by the parser
 * @return the generated HTML, cleaned with the whitelist when one is configured
 */
public static String marked(String src, Options options, Renderer renderer){
    Lexer.LexerResult lexed = new Lexer(options).lex(src);
    String html = new Parser(options, renderer).parse(lexed.getTokens(), lexed.getLinks());

    Whitelist whitelist = options.getWhitelist();
    if (whitelist == null) {
        return html;
    }
    return Jsoup.clean(html, whitelist);
}
Example 28
Source Project: guanggoo-android Source File: NewTopicTask.java License: Apache License 2.0 | 5 votes |
/**
 * Posts a new topic ({@code mTitle}, {@code mContent}) to {@code mUrl} as a form
 * request and reports the outcome on the UI: success for a 200 or 302 status, failure
 * for any other status or an {@link IOException}.
 */
@Override
public void run() {
    String xsrf = getXsrf();

    Map<String, String> headers = new HashMap<>();
    headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
    headers.put("Content-Type", "application/x-www-form-urlencoded");

    Map<String, String> form = new HashMap<>();
    form.put("title", mTitle);
    form.put("content", mContent);
    form.put(ConstantUtil.KEY_XSRF, xsrf);

    // The XSRF token is sent as a form field and is also added as a cookie when absent.
    Map<String, String> cookies = getCookies();
    if (!cookies.containsKey(ConstantUtil.KEY_XSRF)) {
        cookies.put(ConstantUtil.KEY_XSRF, xsrf);
    }

    try {
        Connection.Response res = Jsoup.connect(mUrl)
                .cookies(cookies)
                .headers(headers)
                .data(form)
                .method(Connection.Method.POST)
                .execute();
        int status = res.statusCode();
        boolean accepted = status == ConstantUtil.HTTP_STATUS_200 || status == ConstantUtil.HTTP_STATUS_302;
        if (accepted) {
            successOnUI("发布成功");
            return;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    failedOnUI("发布失败");
}
Example 29
Source Project: astor Source File: DocumentTest.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Checks that a cloned document renders the same HTML as its source, doctype
 * declaration included.
 */
@Test
public void testClonesDeclarations() {
    Document source = Jsoup.parse("<!DOCTYPE html><html><head><title>Doctype test");
    Document copy = source.clone();

    assertEquals(source.html(), copy.html());

    String copyHtml = TextUtil.stripNewlines(copy.html());
    assertEquals("<!doctype html><html><head><title>Doctype test</title></head><body></body></html>",
            copyHtml);
}
Example 30
Source Project: astor Source File: HtmlParserTest.java License: GNU General Public License v2.0 | 5 votes |
/**
 * Checks that a script whose body holds an HTML-style comment with a quoted, split
 * closing script tag comes out of the parser unchanged.
 */
@Test
public void handlesQuotesInCommentsInScripts() {
    String html = "<script>\n"
            + " <!--\n"
            + " document.write('</scr' + 'ipt>');\n"
            + " // -->\n"
            + "</script>";

    Document parsed = Jsoup.parseBodyFragment(html);

    // The expected body markup is exactly the input fragment.
    assertEquals(html, parsed.body().html());
}