Java Code Examples for org.jsoup.nodes.Document#select()

The following examples show how to use org.jsoup.nodes.Document#select(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: jsoup-learning   File: SelectorTest.java    License: MIT License 7 votes vote down vote up
/**
 * Verifies that {@code :has(...)} can be nested inside itself and can wrap the
 * {@code :matches(...)} and {@code :contains(...)} pseudo selectors.
 */
@Test public void testNestedHas() {
    final String html = "<div><p><span>One</span></p></div> <div><p>Two</p></div>";
    Document parsed = Jsoup.parse(html);

    // :has inside :has -- only the first div holds a span within a p
    Elements withSpan = parsed.select("div:has(p:has(span))");
    assertEquals(1, withSpan.size());
    assertEquals("One", withSpan.first().text());

    // :matches inside :has -- the (?i) regex "two" has to find the "Two" paragraph
    Elements byRegex = parsed.select("div:has(p:matches((?i)two))");
    assertEquals(1, byRegex.size());
    assertEquals("div", byRegex.first().tagName());
    assertEquals("Two", byRegex.first().text());

    // :contains inside :has -- lower-case "two" has to find the "Two" paragraph
    Elements byContains = parsed.select("div:has(p:contains(two))");
    assertEquals(1, byContains.size());
    assertEquals("div", byContains.first().tagName());
    assertEquals("Two", byContains.first().text());
}
 
Example 2
Source Project: guanggoo-android   File: AuthCheckTask.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Runs the superclass success handling and then, unless the task was cancelled,
 * loads the telephone verification page and publishes whether the page lacks the
 * {@code button#getSmsCode} element.
 *
 * @param data payload handed through to the superclass handler
 */
@Override
protected void successOnUI(String data) {
    super.successOnUI(data);

    if (mIsCanceled) {
        return;
    }

    Document verifyPage;
    try {
        verifyPage = get(ConstantUtil.VERIFY_TELEPHONE_URL);
    } catch (IOException e) {
        // The page could not be loaded: report it and leave the flag untouched
        e.printStackTrace();
        return;
    }

    // No "get SMS code" button on the page is treated as "telephone verified"
    final boolean telephoneVerified = verifyPage.select("button#getSmsCode").isEmpty();

    mHandler.post(() -> {
        App.getInstance().mGlobal.telephoneVerified.setValue(telephoneVerified);
    });
}
 
Example 3
Source Project: Hentoid   File: NexusParser.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Opens every reader page of the content and collects the URL of the image shown
 * on it.
 *
 * @param content the content whose reader pages are visited
 * @return the {@code src} of the first {@code section a img} element of each page on
 *         which one was found, in page order
 * @throws IOException declared by the signature; which call raises it is not visible here
 */
@Override
protected List<String> parseImages(@NonNull Content content) throws IOException {
    List<String> imageUrls = new ArrayList<>();

    progressStart(content.getQtyPages());
    for (int pageIndex = 0; pageIndex < content.getQtyPages(); pageIndex++) {
        // The reader URL of page N is the stored URL with "001" swapped for N as 3 digits
        String pageUrl = content.getReaderUrl().replace("001", Helper.formatIntAsStr(pageIndex + 1, 3));
        Document page = getOnlineDocument(pageUrl);

        Elements images = (page == null) ? null : page.select("section a img");
        if (images != null && !images.isEmpty()) {
            Element firstImage = images.first();
            imageUrls.add(firstImage.attr("src"));
        }

        // Progress advances for every page, whether or not an image was found
        progressPlus();
    }

    progressComplete();

    return imageUrls;
}
 
Example 4
Source Project: job   File: ZhilianEmailResumeParser.java    License: MIT License 6 votes vote down vote up
/**
 * Looks for the first link whose href contains both {@code ldparam=} and {@code url=},
 * fetches the address that follows the last {@code url=} and copies the text of the
 * first three {@code div.login_content p a} elements of that page into the resume as
 * name, phone and mail. Only the first matching link is tried.
 *
 * @param resume the resume that receives the contact data
 * @param doc    the document searched for the contact link
 */
protected void tryFetchContact(ZhilianResume resume, Document doc) {
  final String urlMarker = "url=";
  final String paramMarker = "ldparam=";
  for (Element link : doc.select("table table table table tr td a")) {
    String href = link.attr("href");
    if (!href.contains(paramMarker) || !href.contains(urlMarker)) {
      continue;
    }

    // Everything after the last "url=" is the address of the contact page
    String target = href.substring(href.lastIndexOf(urlMarker) + urlMarker.length());
    try {
      String body = Request.Get(target).execute().returnContent().asString();
      Elements infos = Jsoup.parse(body).select("div.login_content p a");
      resume.setName(infos.get(0).text());
      resume.setPhone(infos.get(1).text());
      resume.setMail(infos.get(2).text());
    } catch (Exception e) {
      // Best effort: any failure is only reported on stderr
      e.printStackTrace(System.err);
    }

    return;
  }
}
 
Example 5
/**
 * Checks the {@code class-file-format-version.html} report under
 * {@code target/japicmp}: it must contain elements with class
 * {@code class_fileFormatVersion}, and every table cell below them must read
 * {@code MODIFIED}, {@code 50.0} or {@code 52.0}. The test returns silently when the
 * report file does not exist.
 */
@Test
public void testClassFileFormatVersionIsPresent() throws IOException {
	Path report = Paths.get(System.getProperty("user.dir"), "target", "japicmp", "class-file-format-version.html");
	if (!Files.exists(report)) {
		// Original note: this is the JDK 1.7 case
		return;
	}

	Document parsed = Jsoup.parse(report.toFile(), Charset.forName("UTF-8").toString());

	Elements versionBlocks = parsed.select(".class_fileFormatVersion");
	assertThat(versionBlocks.isEmpty(), is(false));

	Elements cells = versionBlocks.select("table > tbody > tr > td");
	assertThat(cells.isEmpty(), is(false));

	for (Element cell : cells) {
		String text = cell.text();
		boolean expected = "MODIFIED".equals(text) || "50.0".equals(text) || "52.0".equals(text);
		if (!expected) {
			Assert.fail("text of HTML element does not equal 'MODIFIED' or 50.0 or 52.0: " + text);
		}
	}
}
 
Example 6
/**
 * Scans an HTML page for {@code <a>} elements and records every link that should be
 * followed.
 *
 * <p>A link is recorded when its absolute URL is non-empty, is not a {@code mailto:}
 * link, is not rejected by {@code SinglePageCheckService.ignoreUrl}, starts with the
 * URL of the current check and differs from {@code referer}. Scanning stops as soon
 * as the {@code abort} flag is seen, after appending an "aborted" message.
 *
 * @param referer  URL of the page being scanned; used as base URI when resolving
 *                 links and stored as the value for every URL recorded
 * @param htmlPage HTML source of the page
 * @param allPages map from discovered URL to the page it was found on; updated in place
 */
protected void findUrls(String referer, String htmlPage, Map<String, String> allPages) {
	log.debug("find urls on this web page: " + referer);
	if (abort) {
		appendMessage("aborted");
		return;
	}
	Document document = Jsoup.parse(htmlPage);
	for (Element link : document.select("a")) {
		// Re-check on every link so an abort request takes effect mid-page
		if (abort) {
			appendMessage("aborted");
			break;
		}
		// Relative hrefs are resolved against the page they were found on
		link.setBaseUri(referer);
		String url = link.absUrl("href").trim();
		log.debug("spider check found url: " + url);
		if (!url.isEmpty()
				&& !url.startsWith("mailto:")
				&& !SinglePageCheckService.ignoreUrl(url, check.getDoNotFollowUrls())
				&& url.startsWith(check.getUrl())
				&& !url.equals(referer)) {
			log.debug("spider check put to all pages url: " + url);
			allPages.put(url, referer);
		}
	}
}
 
Example 7
/**
 * Renders a document as plain text, wrapping the content of every block quote in
 * square brackets.
 *
 * <p>The document is modified in place: it is passed to {@code truncate} (with the
 * flag set when a preview is requested) and bracket text nodes are added around each
 * {@code blockquote}.
 *
 * @param d    the document to render
 * @param full {@code true} to return the complete text; {@code false} to return a
 *             preview of at most {@code PREVIEW_SIZE} chars, followed by an ellipsis
 *             when the text was shortened
 * @return the complete text or its preview
 */
private static String _getText(Document d, boolean full) {
    truncate(d, !full);

    for (Element bq : d.select("blockquote")) {
        bq.prependChild(new TextNode("["));
        bq.appendChild(new TextNode("]"));
    }

    String text = d.text();
    if (full) {
        return text;
    }

    int end = Math.min(text.length(), PREVIEW_SIZE);
    // Never cut between the two halves of a surrogate pair: that would leave a lone
    // high surrogate (an unpaired, invalid code unit) right before the ellipsis.
    if (end > 0 && end < text.length() && Character.isHighSurrogate(text.charAt(end - 1))) {
        end--;
    }

    String preview = text.substring(0, end);
    if (preview.length() < text.length()) {
        preview += "…";
    }

    return preview;
}
 
Example 8
Source Project: ogham   File: JsoupCssInliner.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Copies the rules of a stylesheet onto the elements they select, storing them in
 * the temporary {@code TEMP_STYLE_ATTR} attribute (described as
 * <code>data-cssstyle</code> by the original documentation) instead of
 * <code>style</code>. This is because the rules have to be applied sequentially, but
 * ahead of the <code>style</code> the element defines inline.
 *
 * @param doc
 *            the html document
 * @param stylesheet
 *            the css rules to distribute over the document
 */
private static void extractStyles(Document doc, String stylesheet) {
	// Normalise the stylesheet: drop at-rules, line breaks and comments, collapse spaces
	String css = ignoreAtRules(stylesheet);
	css = NEW_LINES.matcher(css).replaceAll("");
	css = COMMENTS.matcher(css).replaceAll("");
	css = SPACES.matcher(css).replaceAll(" ");

	// Splitting on braces yields alternating "selector" / "declarations" tokens
	StringTokenizer tokens = new StringTokenizer(css.trim(), "{}");
	while (tokens.countTokens() > 1) {
		String selector = tokens.nextToken().trim();
		String properties = tokens.nextToken();
		for (Element target : doc.select(selector)) {
			String existing = target.attr(TEMP_STYLE_ATTR);
			String merged;
			if (existing.length() > 0) {
				merged = concatenateProperties(existing, properties);
			} else {
				merged = properties;
			}
			target.attr(TEMP_STYLE_ATTR, merged);
		}
	}
}
 
Example 9
Source Project: PicKing   File: XiuMM.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Parses an album listing page and stores the albums it finds in the result map.
 *
 * @param baseUrl    not used by this method
 * @param currentUrl stored in the result map under {@code CURRENT_URL}
 * @param result     raw page bytes, decoded as utf-8
 * @param resultMap  map that receives the current url and the list of albums
 * @return the {@code resultMap} instance that was passed in
 * @throws UnsupportedEncodingException declared because the charset is given by name
 */
@Override
public Map<ContentsActivity.parameter, Object> getContent(String baseUrl, String currentUrl, byte[] result, Map<ContentsActivity.parameter, Object> resultMap) throws UnsupportedEncodingException {
    List<AlbumInfo> albums = new ArrayList<>();
    String html = new String(result, "utf-8");
    for (Element albumBox : Jsoup.parse(html).select("div.album")) {
        AlbumInfo info = new AlbumInfo();

        // Title: text of the first span.name, when there is one
        Elements names = albumBox.select("span.name");
        if (!names.isEmpty()) {
            info.setTitle(names.get(0).text());
        }

        // Album link, and the cover image nested inside it
        Elements links = albumBox.select(".pic_box a");
        info.setAlbumUrl(links.attr("href"));

        Elements covers = links.select("img");
        if (!covers.isEmpty()) {
            info.setPicUrl(covers.get(0).attr("src"));
        }

        albums.add(info);
    }

    resultMap.put(ContentsActivity.parameter.CURRENT_URL, currentUrl);
    resultMap.put(ContentsActivity.parameter.RESULT, albums);
    return resultMap;
}
 
Example 10
Source Project: ogham   File: JsoupCssInliner.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Swaps each css link element for a style element holding the content of the linked
 * file, so that the inclusion order stays the same. Links for which the
 * {@code STYLE_ATTR} inline mode is not allowed, or whose css is not found in
 * {@code cssContents}, are left untouched.
 *
 * @param doc
 *            the html document
 * @param cssContents
 *            the list of external css files with their content
 */
private static void internStyles(Document doc, List<ExternalCss> cssContents) {
	for (Element link : doc.select(CSS_LINKS_SELECTOR)) {
		if (!isInlineModeAllowed(link, InlineModes.STYLE_ATTR)) {
			continue;
		}

		ExternalCss css = getCss(cssContents, link.attr(HREF_ATTR));
		if (css == null) {
			continue;
		}

		// Build the replacement style element around the css text
		Element style = new Element(Tag.valueOf(STYLE_TAG), "");
		style.appendChild(new DataNode(getCssContent(css)));
		link.replaceWith(style);
	}
}
 
Example 11
/**
 * Queries the server for the number of books in the current category, including the
 * books of all its sub-categories.
 *
 * @return the number of books in the current category; 0 when the response contains
 *         no {@code totalnumber} input
 * @throws IOException if the query fails
 */
public int queryBooksSize() throws IOException {
    checkCookie();
    String requestBody = "fenlei=" + this.getId() + "&mark=all&Page=1&totalnumber=-1";
    String endpoint = NJULib.baseUrl + "/markbook/classifybookview.jsp";
    String html = MyHttpRequest.postWithCookie(requestBody, endpoint, null, cookie, "UTF-8", "GBK", 1000);

    // The total is read from the value of the "totalnumber" input of the returned page
    Elements totalInputs = Jsoup.parse(html).select("input[name=totalnumber]");
    if (totalInputs.isEmpty()) {
        return 0;
    }
    return Integer.parseInt(totalInputs.get(0).attr("value"));
}
 
Example 12
Source Project: js-dossier   File: EndToEndTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads the {@code data-page-data} attribute of the document's only matching
 * {@code main} element and returns it re-serialised as pretty-printed JSON.
 *
 * @param document the document to inspect
 * @return the trimmed, pretty-printed JSON array
 */
private static String extractPageData(Document document) {
  Elements mains = document.select("main[data-page-data]");
  checkState(!mains.isEmpty(), "Main element not found in %s", document);
  String rawJson = Iterables.getOnlyElement(mains).attributes().dataset().get("page-data");

  // Round-trip through Gson to normalise the formatting
  Gson gson = new GsonBuilder().setPrettyPrinting().create();
  JsonArray parsed = gson.fromJson(rawJson, JsonArray.class);
  return gson.toJson(parsed).trim();
}
 
Example 13
Source Project: baleen   File: Jsp101HeadingsTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks that running the manipulator over this fragment leaves the document
 * without any {@code h2} element.
 */
@Test
public void testNoneHeading() {
  String html =
      "<p><b>This is a group heading:</b></p><p>This is not a group heading</p><p>This is not a group heading.</p>";
  Document fragment = Jsoup.parseBodyFragment(html);

  manipulator.manipulate(fragment);

  assertEquals(0, fragment.select("h2").size());
}
 
Example 14
Source Project: astor   File: SelectorTest.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Verifies the {@code :eq(n)} pseudo selector, both on its own and combined with a
 * second {@code :eq(n)} on the ancestor.
 */
@Test public void testPseudoEquals() {
    // NOTE(review): the "</>p>" sequence looks like a typo; it is test input and is kept as is
    final String html = "<div><p>One</p><p>Two</p><p>Three</>p></div><div><p>Four</p>";
    Document parsed = Jsoup.parse(html);

    // First paragraph of every div
    Elements firstParagraphs = parsed.select("div p:eq(0)");
    assertEquals(2, firstParagraphs.size());
    assertEquals("One", firstParagraphs.get(0).text());
    assertEquals("Four", firstParagraphs.get(1).text());

    // First paragraph of the first div only
    Elements firstOfFirstDiv = parsed.select("div:eq(0) p:eq(0)");
    assertEquals(1, firstOfFirstDiv.size());
    assertEquals("One", firstOfFirstDiv.get(0).text());
    assertEquals("p", firstOfFirstDiv.get(0).tagName());
}
 
Example 15
Source Project: ripme   File: NewsfilterRipper.java    License: MIT License 5 votes vote down vote up
/**
 * Derives the picture URLs of a gallery page from its thumbnails.
 *
 * @param page the gallery page
 * @return for every {@code #galleryImages .inner-block img} element, its {@code src}
 *         with {@code thumbs/} removed and a leading-most {@code https://} turned
 *         into {@code http://}
 */
@Override
protected List<String> getURLsFromPage(Document page) {
    List<String> pictureUrls = new ArrayList<>();
    for (Element thumbnail : page.select("#galleryImages .inner-block img")) {
        String pictureUrl = thumbnail.attr("src").replace("thumbs/", "");
        // use HTTP instead of HTTPS (less headaches)
        String httpUrl = pictureUrl.replaceFirst("https://", "http://");
        pictureUrls.add(httpUrl);
    }
    return pictureUrls;
}
 
Example 16
Source Project: ripme   File: ListalRipper.java    License: MIT License 5 votes vote down vote up
/**
 * Determines the page to continue with. The superclass result is the starting point
 * and is replaced depending on {@code urlType}:
 * <ul>
 * <li>{@code LIST}: when the page has a {@code .loadmoreitems} element, the remaining
 * items are requested by posting the list id and that element's
 * {@code data-offset} to {@code postUrl};</li>
 * <li>{@code FOLDER}: when the text of the last {@code .pages a} link starts with
 * "Next", the page behind that link is fetched.</li>
 * </ul>
 *
 * @param page the current page
 * @return the next page
 * @throws IOException if a request fails (a failed LIST request is logged first)
 */
@Override
public Document getNextPage(Document page) throws IOException {
    Document nextPage = super.getNextPage(page);
    switch (urlType) {
        case LIST: {
            Elements loadMore = page.select(".loadmoreitems");
            if (loadMore.isEmpty()) {
                break;
            }

            // Not all items are loaded yet: load the remaining ones using postUrl.
            String offSet = loadMore.last().attr("data-offset");
            Map<String, String> postParams = new HashMap<>();
            postParams.put("listid", listId);
            postParams.put("offset", offSet);
            try {
                nextPage = Http.url(postUrl).data(postParams).retries(3).post();
            } catch (IOException e1) {
                LOGGER.error("Failed to load more images after " + offSet, e1);
                throw e1;
            }
            break;
        }

        case FOLDER: {
            Elements pageLinks = page.select(".pages a");
            boolean hasNextLink = !pageLinks.isEmpty() && pageLinks.last().text().startsWith("Next");
            if (hasNextLink) {
                nextPage = Http.url(pageLinks.last().attr("abs:href")).retries(3).get();
            }
            break;
        }

        case UNKNOWN:
        default:
            break;
    }
    return nextPage;
}
 
Example 17
/**
 * Looks a word up on the remote dictionary page and turns every slide of the result
 * into a {@link Definition}.
 *
 * @param key the word to look up; appended to {@code wordUrl} and used as prefix of
 *            the generated audio and image file names
 * @return one definition per {@code ul.slides > li} element of the page, or an empty
 *         list when an {@link IOException} occurs
 */
public List<Definition> wordLookup(String key) {
    try {
        Request request = new Request.Builder()
                .url(wordUrl + key)
                .addHeader("User-Agent", Constant.UA)
                .build();
        String rawHtml = MyApplication.getOkHttpClient().newCall(request).execute().body().string();
        Document page = Jsoup.parse(rawHtml);

        List<Definition> definitions = new ArrayList<>();
        for (Element slide : page.select("ul.slides > li")) {
            // Audio: source URL (empty when the slide has none) plus a generated file name
            Elements audioTags = slide.select("audio");
            String audioUrl = audioTags.size() > 0 ? audioTags.get(0).attr("src") : "";
            String audioName = key + "_rrcd_" + Utils.getRandomHexString(8) + ".mp3";

            // Image: same scheme as the audio
            Elements imageTags = slide.select("img");
            String imageUrl = imageTags.size() > 0 ? imageTags.get(0).attr("src") : "";
            String imageName = key + "_rrcd_" + Utils.getRandomHexString(8) + ".png";

            // Text parts of the slide; <em> markup is turned into <b>
            String channel = getSingleQueryResult(slide, "div.mTop", false).trim();
            String en = getSingleQueryResult(slide, "div.mBottom", true)
                    .replaceAll("<em>", "<b>")
                    .replaceAll("</em>", "</b>");
            String cn = getSingleQueryResult(slide, "div.mFoot", true)
                    .replaceAll("<em>", "<b>")
                    .replaceAll("</em>", "</b>");
            // The result is not used below; the call is kept exactly as in the original
            String context = getSingleQueryResult(slide, "div.mTextend", true);
            String detailUrl = "http://www.91dict.com" + slide.select("a.viewdetail").get(0).attr("href");
            String audioTag = String.format("[sound:%s]", Constant.AUDIO_SUB_DIRECTORY + File.separator + audioName);

            String channelHtml = "<font color=grey>" + channel + "</font>";
            String cardHtml = String.format(tplt_card,
                    en,
                    audioTag,
                    cn,
                    channelHtml,
                    Constant.IMAGE_SUB_DIRECTORY + File.separator + imageName,
                    detailUrl);
            String uiHtml = String.format(tplt_ui, en, cn, channelHtml);

            HashMap<String, String> fields = new HashMap<>();
            fields.put(EXP_ELE[0], key);
            fields.put(EXP_ELE[1], cardHtml);
            definitions.add(new Definition(fields, uiHtml, imageUrl, imageName, audioUrl, audioName));
        }

        return definitions;
    } catch (IOException ioe) {
        // A failed request is not reported; the caller simply gets no definitions
        return new ArrayList<>();
    }
}
 
Example 18
Source Project: hipda   File: HiParser.java    License: GNU General Public License v2.0 4 votes vote down vote up
/**
 * Builds the list of messages shown on a message detail page.
 *
 * <p>Entries without a {@code p.cite} paragraph, without a {@code cite} inside it, or
 * without a {@code div.summary} are skipped.
 *
 * @param doc the parsed page, may be {@code null}
 * @return the messages found, or {@code null} when {@code doc} is {@code null}, the
 *         user menu link is missing, or the page has no {@code li.s_clear} entries
 */
private static SimpleListBean parseSmsDetail(Document doc) {
    if (doc == null) {
        return null;
    }

    // The current user's uid and name are taken from the user menu link
    Elements userMenuLinks = doc.select("#umenu cite a.noborder");
    if (userMenuLinks.isEmpty()) {
        return null;
    }
    Element userMenuLink = userMenuLinks.first();
    String mySpaceUrl = Utils.nullToText(userMenuLink.attr("href"));
    String myUid = Utils.getMiddleString(mySpaceUrl, "uid=", "&");
    String myUsername = userMenuLink.text();

    Elements messages = doc.select("li.s_clear");
    if (messages.isEmpty()) {
        return null;
    }

    SimpleListBean list = new SimpleListBean();
    for (Element message : messages) {
        SimpleListItemBean item = new SimpleListItemBean();

        // author
        Elements citeParagraphs = message.select("p.cite");
        if (citeParagraphs.isEmpty()) {
            continue;
        }
        Element citeParagraph = citeParagraphs.first();
        Elements authorCites = citeParagraph.select("cite");
        if (authorCites.isEmpty()) {
            continue;
        }
        item.setAuthor(authorCites.first().text());

        // avatar: own messages reuse the uid read from the user menu
        Elements avatars = message.select("a.avatar");
        if (!avatars.isEmpty()) {
            if (item.getAuthor().equals(myUsername)) {
                item.setUid(myUid);
            } else {
                String spaceUrl = Utils.nullToText(avatars.first().attr("href"));
                item.setUid(Utils.getMiddleString(spaceUrl, "uid=", "&"));
            }
            item.setAvatarUrl(HiUtils.getAvatarUrlByUid(item.getUid()));
        }

        // time: the paragraph's own text, without the text of its children
        item.setTime(citeParagraph.ownText());

        // info
        Elements summaries = message.select("div.summary");
        if (summaries.isEmpty()) {
            continue;
        }
        item.setInfo(summaries.first().html());

        // new: flagged when the first image in the paragraph is the "new message" icon
        Elements icons = citeParagraph.select("img");
        if (!icons.isEmpty() && icons.first().attr("src").contains(HiUtils.NewPMImage)) {
            item.setNew(true);
        }

        list.add(item);
    }

    return list;
}
 
Example 19
Source Project: astor   File: SelectorTest.java    License: GNU General Public License v2.0 4 votes vote down vote up
/**
 * Verifies that an attribute regex selector ({@code [class~=x|y]}) works when combined
 * with a descendant selector.
 */
@Test public void testByAttributeRegexCombined() {
    final String html = "<div><table class=x><td>Hello</td></table></div>";
    Elements tables = Jsoup.parse(html).select("div table[class~=x|y]");

    assertEquals(1, tables.size());
    assertEquals("Hello", tables.text());
}
 
Example 20
Source Project: astor   File: SelectorTest.java    License: GNU General Public License v2.0 4 votes vote down vote up
/**
 * Verifies that {@code :gt(n)} can be combined with a class selector on the ancestor:
 * only the second paragraph of the {@code foo} div is expected.
 */
@Test public void testPseudoCombined() {
    final String html = "<div class='foo'><p>One</p><p>Two</p></div><div><p>Three</p><p>Four</p></div>";
    Elements paragraphs = Jsoup.parse(html).select("div.foo p:gt(0)");

    assertEquals(1, paragraphs.size());
    assertEquals("Two", paragraphs.get(0).text());
}