Java Code Examples for org.jsoup.nodes.Document#select()

The following examples show how to use org.jsoup.nodes.Document#select(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SelectorTest.java    From jsoup-learning with MIT License 7 votes vote down vote up
/**
 * Checks that the {@code :has()} pseudo-selector can be nested inside itself and can
 * wrap the {@code :matches()} and {@code :contains()} pseudo-selectors.
 */
@Test public void testNestedHas() {
    final Document parsed = Jsoup.parse("<div><p><span>One</span></p></div> <div><p>Two</p></div>");

    // :has nested in :has -- only the first div holds a p that itself holds a span
    final Elements withSpan = parsed.select("div:has(p:has(span))");
    assertEquals(1, withSpan.size());
    assertEquals("One", withSpan.first().text());

    // :matches nested in :has -- the (?i) flag makes the pattern case-insensitive
    final Elements byRegex = parsed.select("div:has(p:matches((?i)two))");
    assertEquals(1, byRegex.size());
    assertEquals("div", byRegex.first().tagName());
    assertEquals("Two", byRegex.first().text());

    // :contains nested in :has
    final Elements byContains = parsed.select("div:has(p:contains(two))");
    assertEquals(1, byContains.size());
    assertEquals("div", byContains.first().tagName());
    assertEquals("Two", byContains.first().text());
}
 
Example 2
Source File: ZhilianEmailResumeParser.java    From job with MIT License 6 votes vote down vote up
/**
 * Scans the anchors matched by "table table table table tr td a" for the first one whose
 * href contains both "ldparam=" and "url=". The text after the last "url=" is fetched as a
 * page, and the texts of its first three "div.login_content p a" links are stored on the
 * resume as name, phone and mail (in that order).
 *
 * Any exception raised while fetching or parsing is printed to stderr. Only the first
 * matching anchor is processed, whether or not the fetch succeeded.
 *
 * @param resume the resume to fill in
 * @param doc    the parsed e-mail document
 */
protected void tryFetchContact(ZhilianResume resume, Document doc) {
  final String urlMarker = "url=";
  final String paramMarker = "ldparam=";
  for (Element anchor : doc.select("table table table table tr td a")) {
    String href = anchor.attr("href");
    if (!href.contains(paramMarker) || !href.contains(urlMarker)) {
      continue;
    }

    // everything after the last "url=" is the address of the contact page
    String url = href.substring(href.lastIndexOf(urlMarker) + urlMarker.length());
    try {
      String content = Request.Get(url).execute().returnContent().asString();
      Elements infos = Jsoup.parse(content).select("div.login_content p a");
      resume.setName(infos.get(0).text());
      resume.setPhone(infos.get(1).text());
      resume.setMail(infos.get(2).text());
    } catch (Exception e) {
      e.printStackTrace(System.err);
    }

    return;
  }
}
 
Example 3
Source File: XiuMM.java    From PicKing with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes {@code result} as UTF-8 HTML and builds one {@code AlbumInfo} per
 * {@code div.album} element: the title from {@code span.name} (when present), the album
 * URL from the {@code .pic_box a} href, and the picture URL from the first {@code img}
 * below those links (when present).
 *
 * @param baseUrl    not used by this implementation
 * @param currentUrl stored in the result map under CURRENT_URL
 * @param result     raw page bytes
 * @param resultMap  map that receives CURRENT_URL and RESULT; also the return value
 * @return the same {@code resultMap} instance
 * @throws UnsupportedEncodingException declared by the charset-name String constructor
 */
@Override
public Map<ContentsActivity.parameter, Object> getContent(String baseUrl, String currentUrl, byte[] result, Map<ContentsActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    List<AlbumInfo> albums = new ArrayList<>();
    Document page = Jsoup.parse(new String(result, "utf-8"));
    for (Element albumBox : page.select("div.album")) {
        AlbumInfo info = new AlbumInfo();

        Elements names = albumBox.select("span.name");
        if (!names.isEmpty()) {
            info.setTitle(names.first().text());
        }

        Elements links = albumBox.select(".pic_box a");
        info.setAlbumUrl(links.attr("href"));

        Elements images = links.select("img");
        if (!images.isEmpty()) {
            info.setPicUrl(images.first().attr("src"));
        }

        albums.add(info);
    }
    resultMap.put(ContentsActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(ContentsActivity.parameter.RESULT, albums);
    return resultMap;
}
 
Example 4
Source File: JsoupCssInliner.java    From ogham with Apache License 2.0 6 votes vote down vote up
/**
 * Applies the styles to a <code>data-cssstyle</code> attribute. This is
 * because the styles need to be applied sequentially, but before the
 * <code>style</code> defined for the element inline.
 * <p>
 * The stylesheet is first passed through <code>ignoreAtRules</code> and
 * the NEW_LINES, COMMENTS and SPACES patterns, then split on braces into
 * alternating selector / declaration tokens.
 *
 * @param doc
 *            the html document
 * @param stylesheet
 *            the css rules to apply to the document
 */
private static void extractStyles(Document doc, String stylesheet) {
	String css = ignoreAtRules(stylesheet);
	css = NEW_LINES.matcher(css).replaceAll("");
	css = COMMENTS.matcher(css).replaceAll("");
	css = SPACES.matcher(css).replaceAll(" ");

	StringTokenizer tokens = new StringTokenizer(css.trim(), "{}");
	// a rule needs two tokens (selector + declarations); a trailing lone token is ignored
	while (tokens.countTokens() > 1) {
		String selector = tokens.nextToken();
		String properties = tokens.nextToken();
		for (Element target : doc.select(selector.trim())) {
			String existing = target.attr(TEMP_STYLE_ATTR);
			String merged;
			if (existing.length() > 0) {
				merged = concatenateProperties(existing, properties);
			} else {
				merged = properties;
			}
			target.attr(TEMP_STYLE_ATTR, merged);
		}
	}
}
 
Example 5
Source File: HtmlHelper.java    From FairEmail with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Returns the text of the document with every blockquote wrapped in square brackets.
 * The document is modified in place: {@code truncate(d, !full)} is called first and the
 * bracket text nodes are inserted into each blockquote.
 *
 * @param d    the document to read (and modify)
 * @param full when true the whole text is returned; otherwise at most PREVIEW_SIZE
 *             characters, followed by an ellipsis if anything was cut off
 * @return the full text or its preview
 */
private static String _getText(Document d, boolean full) {
    truncate(d, !full);

    for (Element quote : d.select("blockquote")) {
        quote.prependChild(new TextNode("["));
        quote.appendChild(new TextNode("]"));
    }

    String all = d.text();
    if (full) {
        return all;
    }

    int limit = Math.min(all.length(), PREVIEW_SIZE);
    String head = all.substring(0, limit);
    return head.length() < all.length() ? head + "…" : head;
}
 
Example 6
Source File: SpiderCheckThread.java    From sitemonitoring-production with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Collects the followable links of one page into {@code allPages}.
 * <p>
 * Every {@code a} element of {@code htmlPage} is resolved against {@code referer}. A
 * resolved URL is recorded (URL as key, referer as value) when it is non-empty, is not a
 * {@code mailto:} link, is not rejected by {@code SinglePageCheckService.ignoreUrl},
 * starts with the URL of the current check, and differs from the referer itself.
 * Processing stops, with an "aborted" message, as soon as the {@code abort} flag is seen.
 *
 * @param referer  address of the page being scanned; used as base URI for relative links
 * @param htmlPage HTML source of that page
 * @param allPages receives the accepted URLs mapped to the page they were found on
 */
protected void findUrls(String referer, String htmlPage, Map<String, String> allPages) {
	log.debug("find urls on this web page: " + referer);
	if (abort) {
		appendMessage("aborted");
		return;
	}
	for (Element link : Jsoup.parse(htmlPage).select("a")) {
		if (abort) {
			appendMessage("aborted");
			break;
		}
		link.setBaseUri(referer);
		String url = link.absUrl("href").trim();
		log.debug("spider check found url: " + url);
		boolean follow = !url.isEmpty()
				&& !url.startsWith("mailto:")
				&& !SinglePageCheckService.ignoreUrl(url, check.getDoNotFollowUrls())
				&& url.startsWith(check.getUrl())
				&& !url.equals(referer);
		if (follow) {
			log.debug("spider check put to all pages url: " + url);
			allPages.put(url, referer);
		}
	}
}
 
Example 7
Source File: ITClassFileFormatVersion.java    From japicmp with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that the generated report contains class-file-format-version cells and that
 * each cell reads "MODIFIED", "50.0" or "52.0". The test returns without asserting
 * anything when the report file does not exist.
 */
@Test
public void testClassFileFormatVersionIsPresent() throws IOException {
	Path report = Paths.get(System.getProperty("user.dir"), "target", "japicmp", "class-file-format-version.html");
	if (!Files.exists(report)) {
		return; //in JDK 1.7 case
	}
	String charsetName = Charset.forName("UTF-8").toString();
	Document document = Jsoup.parse(report.toFile(), charsetName);

	Elements versionElements = document.select(".class_fileFormatVersion");
	assertThat(versionElements.isEmpty(), is(false));

	Elements cells = versionElements.select("table > tbody > tr > td");
	assertThat(cells.isEmpty(), is(false));

	for (Element cell : cells) {
		String text = cell.text();
		boolean expected = "MODIFIED".equals(text) || "50.0".equals(text) || "52.0".equals(text);
		if (!expected) {
			Assert.fail("text of HTML element does not equal 'MODIFIED' or 50.0 or 52.0: " + text);
		}
	}
}
 
Example 8
Source File: NexusParser.java    From Hentoid with Apache License 2.0 6 votes vote down vote up
/**
 * Opens every reader page of the content and collects the {@code src} of the first
 * {@code section a img} element found on each. Pages that cannot be loaded (null
 * document) or that have no such image contribute nothing, but still advance progress.
 *
 * @param content the content whose reader pages are visited
 * @return the collected image URLs, in page order
 * @throws IOException declared for the page download
 */
@Override
protected List<String> parseImages(@NonNull Content content) throws IOException {
    List<String> imageUrls = new ArrayList<>();

    progressStart(content.getQtyPages());
    // Page numbers are 1-based and substituted for the "001" part of the reader URL
    for (int page = 1; page <= content.getQtyPages(); page++) {
        String pageUrl = content.getReaderUrl().replace("001", Helper.formatIntAsStr(page, 3));
        Document pageDoc = getOnlineDocument(pageUrl);
        Elements images = (pageDoc == null) ? null : pageDoc.select("section a img");
        if (images != null && !images.isEmpty()) {
            imageUrls.add(images.first().attr("src"));
        }
        progressPlus();
    }

    progressComplete();

    return imageUrls;
}
 
Example 9
Source File: AuthCheckTask.java    From guanggoo-android with Apache License 2.0 6 votes vote down vote up
/**
 * After the parent handling, and unless the task was cancelled, loads the
 * telephone-verification page and publishes whether the telephone counts as verified.
 * The telephone is treated as verified when the page has no {@code button#getSmsCode}
 * element. If the page cannot be loaded the stack trace is printed and nothing is
 * published.
 *
 * @param data passed through to the parent implementation
 */
@Override
protected void successOnUI(String data) {
    super.successOnUI(data);

    if (mIsCanceled) {
        return;
    }

    Document page;
    try {
        page = get(ConstantUtil.VERIFY_TELEPHONE_URL);
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }

    final boolean telephoneVerified = page.select("button#getSmsCode").isEmpty();

    mHandler.post(() -> App.getInstance().mGlobal.telephoneVerified.setValue(telephoneVerified));
}
 
Example 10
Source File: Jsp101HeadingsTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that the manipulator produces no {@code h2} element for the sample fragment.
 */
@Test
public void testNoneHeading() {
  String html =
      "<p><b>This is a group heading:</b></p><p>This is not a group heading</p><p>This is not a group heading.</p>";
  Document document = Jsoup.parseBodyFragment(html);

  manipulator.manipulate(document);

  assertEquals(0, document.select("h2").size());
}
 
Example 11
Source File: EndToEndTest.java    From js-dossier with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the {@code data-page-data} attribute of the document's single
 * {@code main[data-page-data]} element, parses it as a JSON array and returns it
 * re-serialized with pretty printing and trimmed.
 *
 * @param document the rendered page
 * @return the pretty-printed page data
 */
private static String extractPageData(Document document) {
  Elements mains = document.select("main[data-page-data]");
  checkState(!mains.isEmpty(), "Main element not found in %s", document);
  Element main = Iterables.getOnlyElement(mains);

  String rawData = main.attributes().dataset().get("page-data");

  Gson prettyGson = new GsonBuilder().setPrettyPrinting().create();
  JsonArray parsed = prettyGson.fromJson(rawData, JsonArray.class);
  return prettyGson.toJson(parsed).trim();
}
 
Example 12
Source File: SelectorTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks the {@code :eq(n)} pseudo-selector, both on the last selector step only and on
 * two consecutive steps.
 */
@Test public void testPseudoEquals() {
    final Document parsed = Jsoup.parse("<div><p>One</p><p>Two</p><p>Three</>p></div><div><p>Four</p>");

    // first p of every div
    final Elements firstParagraphs = parsed.select("div p:eq(0)");
    assertEquals(2, firstParagraphs.size());
    assertEquals("One", firstParagraphs.get(0).text());
    assertEquals("Four", firstParagraphs.get(1).text());

    // first p of the first div only
    final Elements firstOfFirst = parsed.select("div:eq(0) p:eq(0)");
    assertEquals(1, firstOfFirst.size());
    assertEquals("One", firstOfFirst.get(0).text());
    assertEquals("p", firstOfFirst.get(0).tagName());
}
 
Example 13
Source File: NewsfilterRipper.java    From ripme with MIT License 5 votes vote down vote up
/**
 * Derives image URLs from the gallery thumbnails of a page: the "thumbs/" segment is
 * removed from each thumbnail {@code src} and the first "https://" is switched to
 * "http://".
 *
 * @param page the gallery page
 * @return the derived image URLs, in document order
 */
@Override
protected List<String> getURLsFromPage(Document page) {
    List<String> imgURLs = new ArrayList<>();
    for (Element thumb : page.select("#galleryImages .inner-block img")) {
        String fullSize = thumb.attr("src").replace("thumbs/", "");
        // use HTTP instead of HTTPS (less headaches)
        String overHttp = fullSize.replaceFirst("https://", "http://");
        imgURLs.add(overHttp);
    }
    return imgURLs;
}
 
Example 14
Source File: BookClass.java    From nju-lib-downloader with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Queries the server for the number of books in the current category, including the
 * books of all its sub-categories.
 *
 * @return the number of books in the current category, or 0 when the response contains
 *         no {@code totalnumber} input
 * @throws IOException if the query fails
 */
public int queryBooksSize() throws IOException {
    checkCookie();
    String requestBody = "fenlei=" + this.getId() + "&mark=all&Page=1&totalnumber=-1";
    String endpoint = NJULib.baseUrl + "/markbook/classifybookview.jsp";
    String html = MyHttpRequest.postWithCookie(requestBody, endpoint, null, cookie, "UTF-8", "GBK", 1000);

    Elements totalInputs = Jsoup.parse(html).select("input[name=totalnumber]");
    if (totalInputs.isEmpty()) {
        return 0;
    }
    // the count is carried in the value attribute of the first matching input
    return Integer.parseInt(totalInputs.first().attr("value"));
}
 
Example 15
Source File: JsoupCssInliner.java    From ogham with Apache License 2.0 5 votes vote down vote up
/**
 * Replace link tags with style tags in order to keep the same inclusion
 * order. A link is only replaced when the STYLE_ATTR inline mode is
 * allowed for it and a matching external css entry is found for its
 * href; every other link is left untouched.
 *
 * @param doc
 *            the html document
 * @param cssContents
 *            the list of external css files with their content
 */
private static void internStyles(Document doc, List<ExternalCss> cssContents) {
	for (Element link : doc.select(CSS_LINKS_SELECTOR)) {
		if (!isInlineModeAllowed(link, InlineModes.STYLE_ATTR)) {
			continue;
		}
		ExternalCss css = getCss(cssContents, link.attr(HREF_ATTR));
		if (css == null) {
			continue;
		}
		Element style = new Element(Tag.valueOf(STYLE_TAG), "");
		style.appendChild(new DataNode(getCssContent(css)));
		link.replaceWith(style);
	}
}
 
Example 16
Source File: ListalRipper.java    From ripme with MIT License 5 votes vote down vote up
/**
 * Resolves the page that follows {@code page}, starting from the parent's answer.
 * <ul>
 * <li>LIST: when the page has a {@code .loadmoreitems} element, the remaining items are
 * requested by POSTing the list id and the last element's {@code data-offset} to
 * {@code postUrl}.</li>
 * <li>FOLDER: when the last {@code .pages a} link text starts with "Next", that link is
 * fetched.</li>
 * <li>Otherwise the parent's result is returned unchanged.</li>
 * </ul>
 *
 * @param page the page currently being ripped
 * @return the next page
 * @throws IOException if the parent lookup or the follow-up request fails
 */
@Override
public Document getNextPage(Document page) throws IOException {
    Document nextPage = super.getNextPage(page);
    switch (urlType) {
        case LIST: {
            Elements loadMore = page.select(".loadmoreitems");
            if (loadMore.isEmpty()) {
                break;
            }
            // All items are not loaded.
            // Load remaining items using postUrl.
            String offSet = loadMore.last().attr("data-offset");
            Map<String, String> postParams = new HashMap<>();
            postParams.put("listid", listId);
            postParams.put("offset", offSet);
            try {
                nextPage = Http.url(postUrl).data(postParams).retries(3).post();
            } catch (IOException e1) {
                LOGGER.error("Failed to load more images after " + offSet, e1);
                throw e1;
            }
            break;
        }

        case FOLDER: {
            Elements pageLinks = page.select(".pages a");
            if (!pageLinks.isEmpty()) {
                Element lastLink = pageLinks.last();
                if (lastLink.text().startsWith("Next")) {
                    nextPage = Http.url(lastLink.attr("abs:href")).retries(3).get();
                }
            }
            break;
        }

        case UNKNOWN:
        default:
            break;
    }
    return nextPage;
}
 
Example 17
Source File: SelectorTest.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Checks an attribute-regex selector ({@code [class~=x|y]}) combined with a descendant
 * selector.
 */
@Test public void testByAttributeRegexCombined() {
    final Document parsed = Jsoup.parse("<div><table class=x><td>Hello</td></table></div>");
    final Elements tables = parsed.select("div table[class~=x|y]");
    assertEquals(1, tables.size());
    assertEquals("Hello", tables.text());
}
 
Example 18
Source File: RenRenCiDianSentence.java    From ankihelper with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Looks up example sentences for a word and turns every {@code ul.slides > li} element of
 * the result page into a {@code Definition}.
 * <p>
 * For each slide the audio and image sources, the channel, the English and Chinese
 * sentences (with {@code <em>} rewritten to {@code <b>}) and the detail link are
 * extracted. Missing audio, image or detail link elements are tolerated: the
 * corresponding URL part is left empty instead of aborting the whole lookup.
 *
 * @param key the word to look up; appended as-is to {@code wordUrl}
 * @return one definition per slide, or an empty list if the request fails with an
 *         {@code IOException}
 */
public List<Definition> wordLookup(String key) {
    try {
        Request request = new Request.Builder().url(wordUrl + key)
                .addHeader("User-Agent", Constant.UA)
                .build();
        String rawhtml = MyApplication.getOkHttpClient().newCall(request).execute().body().string();
        Document doc = Jsoup.parse(rawhtml);
        List<Definition> definitionList = new ArrayList<>();

        for (Element audioEle : doc.select("ul.slides > li")) {
            HashMap<String, String> eleMap = new HashMap<>();

            String audioUrl = "";
            Elements audioElements = audioEle.select("audio");
            if (audioElements.size() > 0) {
                audioUrl = audioElements.get(0).attr("src");
            }
            String audioName = key + "_rrcd_" + Utils.getRandomHexString(8) + ".mp3";

            String imageUrl = "";
            Elements imageElements = audioEle.select("img");
            if (imageElements.size() > 0) {
                imageUrl = imageElements.get(0).attr("src");
            }
            String imageName = key + "_rrcd_" + Utils.getRandomHexString(8) + ".png";

            String channel = getSingleQueryResult(audioEle, "div.mTop", false).trim();
            String en = getSingleQueryResult(audioEle, "div.mBottom", true)
                    .replaceAll("<em>", "<b>")
                    .replaceAll("</em>", "</b>");
            String cn = getSingleQueryResult(audioEle, "div.mFoot", true)
                    .replaceAll("<em>", "<b>")
                    .replaceAll("</em>", "</b>");
            // NOTE(review): evaluated as in the original but its result is not used below
            String context = getSingleQueryResult(audioEle, "div.mTextend", true);

            // A slide without a detail link must not abort the whole lookup with an
            // IndexOutOfBoundsException; fall back to the bare site address instead.
            Elements detailLinks = audioEle.select("a.viewdetail");
            String detailPath = detailLinks.isEmpty() ? "" : detailLinks.get(0).attr("href");
            String detailUrl = "http://www.91dict.com" + detailPath;

            String audioTag = String.format("[sound:%s]", Constant.AUDIO_SUB_DIRECTORY + File.separator + audioName);
            String html = String.format(tplt_card,
                    en,
                    audioTag,
                    cn,
                    "<font color=grey>" + channel + "</font>",
                    Constant.IMAGE_SUB_DIRECTORY + File.separator + imageName,
                    detailUrl
            );

            String html_ui = String.format(tplt_ui,
                    en,
                    cn,
                    "<font color=grey>" + channel + "</font>"
            );
            eleMap.put(EXP_ELE[0], key);
            eleMap.put(EXP_ELE[1], html);
            definitionList.add(new Definition(eleMap, html_ui, imageUrl, imageName, audioUrl, audioName));
        }

        return definitionList;

    } catch (IOException ioe) {
        // network failure: report "no results" rather than propagating
        return new ArrayList<>();
    }
}
 
Example 19
Source File: SelectorTest.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Checks the {@code :gt(n)} pseudo-selector combined with a class selector on the
 * ancestor.
 */
@Test public void testPseudoCombined() {
    final Document parsed = Jsoup.parse("<div class='foo'><p>One</p><p>Two</p></div><div><p>Three</p><p>Four</p></div>");
    final Elements laterParagraphs = parsed.select("div.foo p:gt(0)");
    assertEquals(1, laterParagraphs.size());
    assertEquals("Two", laterParagraphs.get(0).text());
}
 
Example 20
Source File: HiParser.java    From hipda with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Builds the message list of a short-message detail page.
 * <p>
 * The current user's uid and name are read from the {@code #umenu cite a.noborder} link.
 * Each {@code li.s_clear} entry then yields one item with author, uid and avatar (when an
 * {@code a.avatar} link exists), time, summary HTML and a "new" flag. Entries lacking the
 * author citation or the summary are skipped.
 *
 * @param doc the parsed page, may be null
 * @return the parsed list, or null when {@code doc} is null, the user menu link is
 *         missing, or the page has no {@code li.s_clear} entries
 */
private static SimpleListBean parseSmsDetail(Document doc) {
    if (doc == null) {
        return null;
    }

    //get my uid and username
    Elements ownMenuLinks = doc.select("#umenu cite a.noborder");
    if (ownMenuLinks.isEmpty()) {
        return null;
    }
    Element ownLink = ownMenuLinks.first();
    String mySpaceUrl = Utils.nullToText(ownLink.attr("href"));
    String myUid = Utils.getMiddleString(mySpaceUrl, "uid=", "&");
    String myUsername = ownLink.text();

    Elements messages = doc.select("li.s_clear");
    if (messages.isEmpty()) {
        return null;
    }

    SimpleListBean list = new SimpleListBean();
    for (Element message : messages) {
        SimpleListItemBean item = new SimpleListItemBean();

        // author
        Elements citeParagraphs = message.select("p.cite");
        if (citeParagraphs.isEmpty()) {
            continue;
        }
        Element citeParagraph = citeParagraphs.first();
        Elements authorCites = citeParagraph.select("cite");
        if (authorCites.isEmpty()) {
            continue;
        }
        item.setAuthor(authorCites.first().text());

        // avatar -- own messages reuse the uid taken from the user menu
        Elements avatars = message.select("a.avatar");
        if (!avatars.isEmpty()) {
            String uid;
            if (item.getAuthor().equals(myUsername)) {
                uid = myUid;
            } else {
                String spaceUrl = Utils.nullToText(avatars.first().attr("href"));
                uid = Utils.getMiddleString(spaceUrl, "uid=", "&");
            }
            item.setUid(uid);
            item.setAvatarUrl(HiUtils.getAvatarUrlByUid(item.getUid()));
        }

        // time
        item.setTime(citeParagraph.ownText());

        // info
        Elements summaries = message.select("div.summary");
        if (summaries.isEmpty()) {
            continue;
        }
        item.setInfo(summaries.first().html());

        // new
        Elements images = citeParagraph.select("img");
        if (!images.isEmpty() && images.first().attr("src").contains(HiUtils.NewPMImage)) {
            item.setNew(true);
        }

        list.add(item);
    }

    return list;
}