Java Code Examples for org.jsoup.nodes.Document#select()
The following examples show how to use
org.jsoup.nodes.Document#select().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SelectorTest.java From jsoup-learning with MIT License | 7 votes |
@Test public void testNestedHas() { Document doc = Jsoup.parse("<div><p><span>One</span></p></div> <div><p>Two</p></div>"); Elements divs = doc.select("div:has(p:has(span))"); assertEquals(1, divs.size()); assertEquals("One", divs.first().text()); // test matches in has divs = doc.select("div:has(p:matches((?i)two))"); assertEquals(1, divs.size()); assertEquals("div", divs.first().tagName()); assertEquals("Two", divs.first().text()); // test contains in has divs = doc.select("div:has(p:contains(two))"); assertEquals(1, divs.size()); assertEquals("div", divs.first().tagName()); assertEquals("Two", divs.first().text()); }
Example 2
Source File: ZhilianEmailResumeParser.java From job with MIT License | 6 votes |
/**
 * Tries to fill in the contact details of a resume from a page linked in the document.
 * <p>
 * The first anchor (under {@code table table table table tr td a}) whose {@code href}
 * contains both {@code "ldparam="} and {@code "url="} is used: everything after the last
 * {@code "url="} is fetched, and the texts of the first three anchors matching
 * {@code div.login_content p a} are stored as name, phone and mail, in that order.
 * Only that first matching anchor is attempted; any exception raised while fetching or
 * parsing is printed to {@code System.err} and otherwise ignored.
 *
 * @param resume the resume to populate
 * @param doc    the parsed document to search for the contact link
 */
protected void tryFetchContact(ZhilianResume resume, Document doc) {
    final String urlMarker = "url=";
    final String paramMarker = "ldparam=";

    for (Element link : doc.select("table table table table tr td a")) {
        String href = link.attr("href");
        if (!href.contains(paramMarker) || !href.contains(urlMarker)) {
            continue;
        }

        String url = href.substring(href.lastIndexOf(urlMarker) + urlMarker.length());
        try {
            String content = Request.Get(url).execute().returnContent().asString();
            Elements infos = Jsoup.parse(content).select("div.login_content p a");
            resume.setName(infos.get(0).text());
            resume.setPhone(infos.get(1).text());
            resume.setMail(infos.get(2).text());
        } catch (Exception e) {
            e.printStackTrace(System.err);
        }
        // Stop after the first candidate link, whether or not it succeeded.
        return;
    }
}
Example 3
Source File: XiuMM.java From PicKing with Apache License 2.0 | 6 votes |
/**
 * Decodes the response bytes as UTF-8 HTML and collects one {@code AlbumInfo} per
 * {@code div.album} element.
 * <p>
 * For each album the title comes from the first {@code span.name}, the album URL from the
 * {@code href} of the {@code .pic_box a} match, and the picture URL from the {@code src}
 * of the first {@code img} under that link. Title and picture are set only when the
 * corresponding element exists.
 *
 * @param baseUrl    not used by this implementation
 * @param currentUrl stored in the result map under {@code CURRENT_URL}
 * @param result     raw page bytes
 * @param resultMap  map that receives the current URL and the album list; also returned
 * @return the same {@code resultMap} instance
 * @throws UnsupportedEncodingException declared for the charset-name based String decoding
 */
@Override
public Map<ContentsActivity.parameter, Object> getContent(String baseUrl, String currentUrl, byte[] result, Map<ContentsActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    String html = new String(result, "utf-8");
    List<AlbumInfo> albums = new ArrayList<>();

    for (Element albumNode : Jsoup.parse(html).select("div.album")) {
        AlbumInfo info = new AlbumInfo();

        Elements names = albumNode.select("span.name");
        if (!names.isEmpty()) {
            info.setTitle(names.first().text());
        }

        Elements links = albumNode.select(".pic_box a");
        info.setAlbumUrl(links.attr("href"));

        Elements images = links.select("img");
        if (!images.isEmpty()) {
            info.setPicUrl(images.first().attr("src"));
        }

        albums.add(info);
    }

    resultMap.put(ContentsActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(ContentsActivity.parameter.RESULT, albums);
    return resultMap;
}
Example 4
Source File: JsoupCssInliner.java From ogham with Apache License 2.0 | 6 votes |
/**
 * Applies the styles to a <code>data-cssstyle</code> attribute. This is
 * because the styles need to be applied sequentially, but before the
 * <code>style</code> defined for the element inline.
 * <p>
 * The stylesheet is first cleaned (at-rules, new lines and comments removed,
 * whitespace collapsed) and then split on <code>{</code> and <code>}</code>
 * into alternating selector / properties tokens.
 *
 * @param doc
 *            the html document
 * @param stylesheet
 *            the css rules to apply to the document
 */
private static void extractStyles(Document doc, String stylesheet) {
    String css = ignoreAtRules(stylesheet);
    css = NEW_LINES.matcher(css).replaceAll("");
    css = COMMENTS.matcher(css).replaceAll("");
    css = SPACES.matcher(css).replaceAll(" ");

    StringTokenizer rules = new StringTokenizer(css.trim(), "{}");
    // Tokens are consumed in pairs; a trailing unpaired token is left unread.
    while (rules.countTokens() > 1) {
        String selector = rules.nextToken().trim();
        String properties = rules.nextToken();

        for (Element target : doc.select(selector)) {
            String existing = target.attr(TEMP_STYLE_ATTR);
            String merged;
            if (existing.length() > 0) {
                merged = concatenateProperties(existing, properties);
            } else {
                merged = properties;
            }
            target.attr(TEMP_STYLE_ATTR, merged);
        }
    }
}
Example 5
Source File: HtmlHelper.java From FairEmail with GNU General Public License v3.0 | 6 votes |
/**
 * Returns the text of the document with every blockquote wrapped in square brackets.
 * <p>
 * {@code truncate(d, !full)} is invoked first. When {@code full} is false the text is cut
 * to at most {@code PREVIEW_SIZE} characters, and an ellipsis is appended if anything was
 * cut off.
 *
 * @param d    the document to read; it is modified (bracket text nodes are inserted)
 * @param full whether to return the complete text rather than a preview
 * @return the full text or its preview
 */
private static String _getText(Document d, boolean full) {
    truncate(d, !full);

    for (Element quote : d.select("blockquote")) {
        quote.prependChild(new TextNode("["));
        quote.appendChild(new TextNode("]"));
    }

    String text = d.text();
    if (full || text.length() <= PREVIEW_SIZE) {
        return text;
    }
    return text.substring(0, PREVIEW_SIZE) + "…";
}
Example 6
Source File: SpiderCheckThread.java From sitemonitoring-production with BSD 3-Clause "New" or "Revised" License | 6 votes |
protected void findUrls(String referer, String htmlPage, Map<String, String> allPages) { log.debug("find urls on this web page: " + referer); if (abort) { appendMessage("aborted"); return; } Document document = Jsoup.parse(htmlPage); Elements newsHeadlines = document.select("a"); Iterator<Element> iterator = newsHeadlines.iterator(); while (iterator.hasNext()) { if (abort) { appendMessage("aborted"); break; } Element element = (Element) iterator.next(); element.setBaseUri(referer); // System.out.println("base uri: "+ check.getUrl()); // System.out.println("referer: "+ referer); String url = element.absUrl("href").trim(); log.debug("spider check found url: " + url); if (!url.toString().isEmpty() && !url.startsWith("mailto:") && !SinglePageCheckService.ignoreUrl(url, check.getDoNotFollowUrls()) && url.startsWith(check.getUrl()) && !url.equals(referer)) { log.debug("spider check put to all pages url: " + url); allPages.put(url, referer); } } }
Example 7
Source File: ITClassFileFormatVersion.java From japicmp with Apache License 2.0 | 6 votes |
@Test public void testClassFileFormatVersionIsPresent() throws IOException { Path htmlPath = Paths.get(System.getProperty("user.dir"), "target", "japicmp", "class-file-format-version.html"); if (!Files.exists(htmlPath)) { return; //in JDK 1.7 case } Document document = Jsoup.parse(htmlPath.toFile(), Charset.forName("UTF-8").toString()); Elements classFileFormatElements = document.select(".class_fileFormatVersion"); assertThat(classFileFormatElements.isEmpty(), is(false)); Elements tdCells = classFileFormatElements.select("table > tbody > tr > td"); assertThat(tdCells.isEmpty(), is(false)); for (Element element : tdCells) { String text = element.text(); if (!"MODIFIED".equals(text) && !"50.0".equals(text) && !"52.0".equals(text)) { Assert.fail("text of HTML element does not equal 'MODIFIED' or 50.0 or 52.0: " + text); } } }
Example 8
Source File: NexusParser.java From Hentoid with Apache License 2.0 | 6 votes |
/**
 * Visits every reader page of the given content and collects the {@code src} of the first
 * {@code section a img} element found on each page.
 * <p>
 * The reader URL of each page is derived by replacing {@code "001"} in the content's reader
 * URL with the page number formatted via {@code Helper.formatIntAsStr(n, 3)}. Pages whose
 * document is {@code null} or has no matching image contribute nothing. Progress is
 * reported once per page.
 *
 * @param content the content whose pages are to be scanned
 * @return the image URLs found, in page order
 * @throws IOException declared by the signature; presumably raised by
 *                     {@code getOnlineDocument} — confirm against that helper
 */
@Override
protected List<String> parseImages(@NonNull Content content) throws IOException {
    List<String> imageUrls = new ArrayList<>();
    progressStart(content.getQtyPages());

    // Open all pages and grab the URL of the displayed image
    for (int pageIndex = 0; pageIndex < content.getQtyPages(); pageIndex++) {
        String pageNumber = Helper.formatIntAsStr(pageIndex + 1, 3);
        String readerUrl = content.getReaderUrl().replace("001", pageNumber);

        Document doc = getOnlineDocument(readerUrl);
        Elements images = (doc == null) ? null : doc.select("section a img");
        if (images != null && !images.isEmpty()) {
            imageUrls.add(images.first().attr("src"));
        }

        progressPlus();
    }

    progressComplete();
    return imageUrls;
}
Example 9
Source File: AuthCheckTask.java From guanggoo-android with Apache License 2.0 | 6 votes |
/**
 * After the parent handling, and unless the task was cancelled, loads the telephone
 * verification page and records whether the telephone is verified.
 * <p>
 * The telephone is treated as verified when the page contains no
 * {@code button#getSmsCode} element. The result is published through {@code mHandler}.
 * If loading the page fails, the stack trace is printed and nothing is published.
 *
 * @param data passed through to the parent implementation
 */
@Override
protected void successOnUI(String data) {
    super.successOnUI(data);
    if (mIsCanceled) {
        return;
    }

    final Document doc;
    try {
        doc = get(ConstantUtil.VERIFY_TELEPHONE_URL);
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }

    final boolean telephoneVerified = doc.select("button#getSmsCode").isEmpty();
    mHandler.post(() -> App.getInstance().mGlobal.telephoneVerified.setValue(telephoneVerified));
}
Example 10
Source File: Jsp101HeadingsTest.java From baleen with Apache License 2.0 | 5 votes |
/**
 * Checks that the manipulator produces no {@code h2} elements for the given fragment of
 * three paragraphs.
 */
@Test
public void testNoneHeading() {
    String html =
        "<p><b>This is a group heading:</b></p><p>This is not a group heading</p><p>This is not a group heading.</p>";
    Document document = Jsoup.parseBodyFragment(html);

    manipulator.manipulate(document);

    assertEquals(0, document.select("h2").size());
}
Example 11
Source File: EndToEndTest.java From js-dossier with Apache License 2.0 | 5 votes |
/**
 * Reads the {@code data-page-data} attribute of the document's {@code main} element,
 * parses it as a JSON array and returns it pretty-printed and trimmed.
 *
 * @param document the document to inspect
 * @return the pretty-printed JSON array
 */
private static String extractPageData(Document document) {
    Elements mains = document.select("main[data-page-data]");
    checkState(!mains.isEmpty(), "Main element not found in %s", document);

    // getOnlyElement is applied to the non-empty match list.
    Element main = Iterables.getOnlyElement(mains);
    String rawJson = main.attributes().dataset().get("page-data");

    Gson gson = new GsonBuilder().setPrettyPrinting().create();
    JsonArray parsed = gson.fromJson(rawJson, JsonArray.class);
    return gson.toJson(parsed).trim();
}
Example 12
Source File: SelectorTest.java From astor with GNU General Public License v2.0 | 5 votes |
/**
 * Checks the {@code :eq(n)} pseudo-selector, both on its own and combined on two levels.
 */
@Test
public void testPseudoEquals() {
    Document doc = Jsoup.parse("<div><p>One</p><p>Two</p><p>Three</>p></div><div><p>Four</p>");

    // first <p> of every div
    Elements firstParagraphs = doc.select("div p:eq(0)");
    assertEquals(2, firstParagraphs.size());
    assertEquals("One", firstParagraphs.get(0).text());
    assertEquals("Four", firstParagraphs.get(1).text());

    // first <p> of the first div only
    Elements firstOfFirstDiv = doc.select("div:eq(0) p:eq(0)");
    assertEquals(1, firstOfFirstDiv.size());
    Element only = firstOfFirstDiv.get(0);
    assertEquals("One", only.text());
    assertEquals("p", only.tagName());
}
Example 13
Source File: NewsfilterRipper.java From ripme with MIT License | 5 votes |
@Override protected List<String> getURLsFromPage(Document page) { List<String> imgURLs = new ArrayList<>(); Elements thumbnails = page.select("#galleryImages .inner-block img"); for (Element thumb : thumbnails) { String thumbUrl = thumb.attr("src"); String picUrl = thumbUrl.replace("thumbs/", ""); // use HTTP instead of HTTPS (less headaches) imgURLs.add(picUrl.replaceFirst("https://", "http://")); } return imgURLs; }
Example 14
Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0 | 5 votes |
/** * 从服务器查询当前分类下图书的数量。包含所有子分类下的图书 * * @return 当前分类下图书的数量 * @throws IOException 查询失败 */ public int queryBooksSize() throws IOException { checkCookie(); String data = "fenlei=" + this.getId() + "&mark=all&Page=1&totalnumber=-1"; String Url = NJULib.baseUrl + "/markbook/classifybookview.jsp"; String html = MyHttpRequest.postWithCookie(data, Url, null, cookie, "UTF-8", "GBK", 1000); // System.out.println(html); Document doc = Jsoup.parse(html); Elements form = doc.select("input[name=totalnumber]"); if (!form.isEmpty()) { String booksize = form.get(0).attr("value"); return Integer.parseInt(booksize); } return 0; }
Example 15
Source File: JsoupCssInliner.java From ogham with Apache License 2.0 | 5 votes |
/**
 * Replace link tags with style tags in order to keep the same inclusion
 * order.
 * <p>
 * A link is replaced only when inlining is allowed for it and a matching
 * external css entry is found for its href; other links are left untouched.
 *
 * @param doc
 *            the html document
 * @param cssContents
 *            the list of external css files with their content
 */
private static void internStyles(Document doc, List<ExternalCss> cssContents) {
    for (Element link : doc.select(CSS_LINKS_SELECTOR)) {
        if (!isInlineModeAllowed(link, InlineModes.STYLE_ATTR)) {
            continue;
        }

        ExternalCss css = getCss(cssContents, link.attr(HREF_ATTR));
        if (css == null) {
            continue;
        }

        Element style = new Element(Tag.valueOf(STYLE_TAG), "");
        style.appendChild(new DataNode(getCssContent(css)));
        link.replaceWith(style);
    }
}
Example 16
Source File: ListalRipper.java From ripme with MIT License | 5 votes |
@Override public Document getNextPage(Document page) throws IOException { Document nextPage = super.getNextPage(page); switch (urlType) { case LIST: if (!page.select(".loadmoreitems").isEmpty()) { // All items are not loaded. // Load remaining items using postUrl. String offSet = page.select(".loadmoreitems").last().attr("data-offset"); Map<String, String> postParams = new HashMap<>(); postParams.put("listid", listId); postParams.put("offset", offSet); try { nextPage = Http.url(postUrl).data(postParams).retries(3).post(); } catch (IOException e1) { LOGGER.error("Failed to load more images after " + offSet, e1); throw e1; } } break; case FOLDER: Elements pageLinks = page.select(".pages a"); if (!pageLinks.isEmpty() && pageLinks.last().text().startsWith("Next")) { String nextUrl = pageLinks.last().attr("abs:href"); nextPage = Http.url(nextUrl).retries(3).get(); } break; case UNKNOWN: default: } return nextPage; }
Example 17
Source File: SelectorTest.java From astor with GNU General Public License v2.0 | 4 votes |
/**
 * Checks that an attribute-regex selector ({@code [class~=x|y]}) works when combined with
 * a descendant selector.
 */
@Test
public void testByAttributeRegexCombined() {
    String html = "<div><table class=x><td>Hello</td></table></div>";
    Elements matches = Jsoup.parse(html).select("div table[class~=x|y]");

    assertEquals(1, matches.size());
    assertEquals("Hello", matches.text());
}
Example 18
Source File: RenRenCiDianSentence.java From ankihelper with GNU General Public License v3.0 | 4 votes |
public List<Definition> wordLookup(String key) { try { // Document doc = Jsoup.connect(wordUrl + key) // .userAgent("Mozilla") // .timeout(5000) // .get(); Request request = new Request.Builder().url(wordUrl + key) //.addHeader("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36") .addHeader("User-Agent", Constant.UA) .build(); String rawhtml = MyApplication.getOkHttpClient().newCall(request).execute().body().string(); Document doc = Jsoup.parse(rawhtml); List<Definition> definitionList = new ArrayList<>(); for(Element audioEle : doc.select("ul.slides > li")){ HashMap<String, String> eleMap = new HashMap<>(); String audioUrl = ""; Elements audioElements = audioEle.select("audio"); if(audioElements.size() > 0){ audioUrl = audioElements.get(0).attr("src"); } String audioName = key + "_rrcd_" + Utils.getRandomHexString(8) + ".mp3"; String imageUrl = ""; Elements imageElements = audioEle.select("img"); if(imageElements.size() > 0){ imageUrl = imageElements.get(0).attr("src"); } String imageName = key + "_rrcd_" + Utils.getRandomHexString(8) + ".png"; String channel = getSingleQueryResult(audioEle, "div.mTop", false).trim(); String en = getSingleQueryResult(audioEle, "div.mBottom", true) .replaceAll("<em>", "<b>") .replaceAll("</em>", "</b>"); String cn = getSingleQueryResult(audioEle, "div.mFoot", true) .replaceAll("<em>", "<b>") .replaceAll("</em>", "</b>"); String context = getSingleQueryResult(audioEle, "div.mTextend", true); String detailUrl = "http://www.91dict.com" + audioEle.select("a.viewdetail").get(0).attr("href"); String audioTag = String.format("[sound:%s]", Constant.AUDIO_SUB_DIRECTORY + File.separator + audioName); String html = String.format(tplt_card, en, audioTag, cn, "<font color=grey>" + channel + "</font>", Constant.IMAGE_SUB_DIRECTORY + File.separator + imageName, detailUrl ); String html_ui = String.format(tplt_ui, en, cn, "<font color=grey>" + 
channel + "</font>" ); eleMap.put(EXP_ELE[0], key); eleMap.put(EXP_ELE[1], html); definitionList.add(new Definition(eleMap, html_ui, imageUrl, imageName, audioUrl, audioName)); } return definitionList; } catch (IOException ioe) { //Log.d("time out", Log.getStackTraceString(ioe)); //Toast.makeText(MyApplication.getContext(), Log.getStackTraceString(ioe), Toast.LENGTH_SHORT).show(); return new ArrayList<Definition>(); } }
Example 19
Source File: SelectorTest.java From astor with GNU General Public License v2.0 | 4 votes |
/**
 * Checks that {@code :gt(n)} can be combined with a class selector on the parent.
 */
@Test
public void testPseudoCombined() {
    String html = "<div class='foo'><p>One</p><p>Two</p></div><div><p>Three</p><p>Four</p></div>";
    Elements matches = Jsoup.parse(html).select("div.foo p:gt(0)");

    assertEquals(1, matches.size());
    assertEquals("Two", matches.get(0).text());
}
Example 20
Source File: HiParser.java From hipda with GNU General Public License v2.0 | 4 votes |
private static SimpleListBean parseSmsDetail(Document doc) { if (doc == null) { return null; } //get my uid and username Elements uidMenuES = doc.select("#umenu cite a.noborder"); if (uidMenuES.size() < 1) { return null; } String mySpaceUrl = Utils.nullToText(uidMenuES.first().attr("href")); String myUid = Utils.getMiddleString(mySpaceUrl, "uid=", "&"); String myUsername = uidMenuES.first().text(); Elements smslistES = doc.select("li.s_clear"); if (smslistES.size() < 1) { return null; } SimpleListBean list = new SimpleListBean(); for (int i = 0; i < smslistES.size(); ++i) { Element smsE = smslistES.get(i); SimpleListItemBean item = new SimpleListItemBean(); // author Elements pciteES = smsE.select("p.cite"); if (pciteES.size() == 0) { continue; } Elements citeES = pciteES.first().select("cite"); if (citeES.size() == 0) { continue; } item.setAuthor(citeES.first().text()); // avatar Elements avatarES = smsE.select("a.avatar"); if (avatarES.size() > 0) { if (item.getAuthor().equals(myUsername)) { item.setUid(myUid); } else { String spaceUrl = Utils.nullToText(avatarES.first().attr("href")); item.setUid(Utils.getMiddleString(spaceUrl, "uid=", "&")); } item.setAvatarUrl(HiUtils.getAvatarUrlByUid(item.getUid())); } // time item.setTime(pciteES.first().ownText()); // info Elements summaryES = smsE.select("div.summary"); if (summaryES.size() == 0) { continue; } item.setInfo(summaryES.first().html()); // new Elements imgES = pciteES.first().select("img"); if (imgES.size() > 0) { if (imgES.first().attr("src").contains(HiUtils.NewPMImage)) { item.setNew(true); } } list.add(item); } return list; }