Java Code Examples for org.jsoup.select.Elements#iterator()

The following examples show how to use org.jsoup.select.Elements#iterator(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: HtmlExporterTest.java From LicenseScout with Apache License 2.0

6 votes

/**
 * Verifies the exported HTML: every {@code <meta>} element that carries an
 * {@code http-equiv} attribute must have a {@code content} value ending in the
 * output charset of the test variant, and the expected report tables must exist.
 *
 * @param testVariant the variant under test; supplies the expected output charset
 * @param resultContent the HTML text to check
 */
@Override
protected void assertResultContent(TestVariant testVariant, final String resultContent) {
    final Document doc = Jsoup.parse(resultContent);
    for (final Element element : doc.getElementsByTag("meta")) {
        final Attributes attributes = element.attributes();
        // Attributes.get() yields "" (never null) for a missing key, so a null check
        // would match every meta element; presence has to be tested with hasKey().
        if (attributes.hasKey("http-equiv")) {
            final String contentAttribute = attributes.get("content");
            final String expected = "charset=" + testVariant.getOutputCharset().name();
            Assert.assertTrue("Encoding", contentAttribute.endsWith(expected));
        }
    }
    Assert.assertNotNull("Detection statistics table present", doc.getElementById("detection_statistics_table"));
    Assert.assertNotNull("Legal statistics table present", doc.getElementById("legal_statistics_table"));
    Assert.assertNotNull("Genral statistics table present", doc.getElementById("general_statistics_table"));
    Assert.assertNotNull("Main table present", doc.getElementById("license_table"));
}

Example 2

Source File: RenderHelpler.java From jboot with Apache License 2.0

6 votes

/**
 * Prefixes the value of {@code attrName} with {@code domain} on every element whose
 * value is a site-relative path (starts with a single "/").
 *
 * Elements carrying a {@code cdn-exclude} attribute, and values that are blank,
 * do not start with "/" or start with "//", are left unchanged.
 *
 * @param elements the elements to rewrite
 * @param attrName the attribute holding the URL
 * @param domain the prefix to prepend
 */
private static void replace(Elements elements, String attrName, String domain) {
    for (Element node : elements) {
        if (node.hasAttr("cdn-exclude")) {
            continue;
        }

        String original = node.attr(attrName);
        boolean siteRelative = !StrUtil.isBlank(original)
                && original.startsWith("/")
                && !original.startsWith("//");
        if (siteRelative) {
            node.attr(attrName, domain + original);
        }
    }
}

Example 3

Source File: SpiderCheckThread.java From sitemonitoring-production with BSD 3-Clause "New" or "Revised" License

6 votes

/**
 * Scans the anchors of a fetched page and records the links to follow.
 *
 * A link is stored in {@code allPages} (absolute URL mapped to {@code referer}) when it
 * is non-empty, is not a {@code mailto:} link, is not rejected by
 * {@code SinglePageCheckService.ignoreUrl}, starts with {@code check.getUrl()} and
 * differs from {@code referer}. Processing stops as soon as {@code abort} is set.
 *
 * @param referer URL of the page being scanned; used as base URI for relative links
 * @param htmlPage HTML text of the page
 * @param allPages receives the discovered URLs
 */
protected void findUrls(String referer, String htmlPage, Map<String, String> allPages) {
		log.debug("find urls on this web page: " + referer);
		if (abort) {
			appendMessage("aborted");
			return;
		}
		Elements anchors = Jsoup.parse(htmlPage).select("a");
		for (Element anchor : anchors) {
			if (abort) {
				appendMessage("aborted");
				break;
			}
			// Resolve relative hrefs against the page the link was found on.
			anchor.setBaseUri(referer);
			String url = anchor.absUrl("href").trim();
			log.debug("spider check found url: " + url);
			if (url.isEmpty() || url.startsWith("mailto:")) {
				continue;
			}
			if (SinglePageCheckService.ignoreUrl(url, check.getDoNotFollowUrls())) {
				continue;
			}
			if (url.startsWith(check.getUrl()) && !url.equals(referer)) {
				log.debug("spider check put to all pages url: " + url);
				allPages.put(url, referer);
			}
		}
	}

Example 4

Source File: SpiderCheckThread.java From sitemonitoring-production with BSD 3-Clause "New" or "Revised" License

6 votes

/**
 * Scans the anchors of a fetched page and records the links to follow.
 *
 * A link is stored in {@code allPages} (absolute URL mapped to {@code referer}) when it
 * is non-empty, is not a {@code mailto:} link, is not rejected by
 * {@code SinglePageCheckService.ignoreUrl}, starts with {@code check.getUrl()} and
 * differs from {@code referer}. Processing stops as soon as {@code abort} is set.
 *
 * @param referer URL of the page being scanned; used as base URI for relative links
 * @param htmlPage HTML text of the page
 * @param allPages receives the discovered URLs
 */
protected void findUrls(String referer, String htmlPage, Map<String, String> allPages) {
		log.debug("find urls on this web page: " + referer);
		if (abort) {
			appendMessage("aborted");
			return;
		}
		Elements anchors = Jsoup.parse(htmlPage).select("a");
		for (Element anchor : anchors) {
			if (abort) {
				appendMessage("aborted");
				break;
			}
			// Resolve relative hrefs against the page the link was found on.
			anchor.setBaseUri(referer);
			String url = anchor.absUrl("href").trim();
			log.debug("spider check found url: " + url);
			if (url.isEmpty() || url.startsWith("mailto:")) {
				continue;
			}
			if (SinglePageCheckService.ignoreUrl(url, check.getDoNotFollowUrls())) {
				continue;
			}
			if (url.startsWith(check.getUrl()) && !url.equals(referer)) {
				log.debug("spider check put to all pages url: " + url);
				allPages.put(url, referer);
			}
		}
	}

Example 5

Source File: HtmlUtil.java From V2EX with GNU General Public License v3.0

5 votes

/**
 * Builds one {@code Reply} per element matching {@code #Main > .box > .cell[id]}.
 *
 * Images inside the reply content are forced to width 100% / height auto before the
 * content HTML is stored. The remaining fields are extracted from the cell's HTML text
 * with the {@code PATTERN_REPLY_*} patterns. The floor number is the zero-based
 * position of the cell in the selection.
 *
 * @param document the parsed topic page
 * @param poster user name of the topic author, or {@code null} to skip the poster flag
 * @return the replies in document order
 * @throws V2exException if a cell has no {@code .reply_content} element
 */
private static List<Reply> getReplies(Document document, String poster){

        Elements cells = document.select("#Main > .box > .cell[id]");
        List<Reply> replies = new ArrayList<>(cells.size());

        int floor = 0;
        for (Element cellElement : cells) {
            Reply reply = new Reply();

            Element content = cellElement.selectFirst(".reply_content");
            if (content == null) {
                throw new V2exException("This post seems to have been blocked\nEmpty reply content");
            }
            for (Element image : content.select("img")) {
                image.attr("width", "100%");
                image.attr("height", "auto");
            }
            reply.setContent(content.html());

            // All remaining fields are matched against the serialized cell markup.
            String markup = cellElement.toString();
            int replyId = matcherGroup1Int(PATTERN_REPLY_ID, markup);
            String author = matcherGroup1(PATTERN_REPLY_USERNAME, markup);
            String avatar = matcherGroup1(PATTERN_REPLY_AVATAR, markup);

            reply.setId(replyId);
            reply.setMember(new Member(author, avatar));
            if (poster != null) {
                reply.setPoster(author.equals(poster));
            }
            reply.setAgo(matcherGroup1(PATTERN_REPLY_AGO, markup));
            reply.setVia(matcherGroup1(PATTERN_REPLY_VIA, markup));
            reply.setLike(matcherGroup1Int(PATTERN_REPLY_LIKE, markup));
            reply.setFloor(floor);

            replies.add(reply);
            floor++;
        }
        return replies;
    }

Example 6

Source File: TemplateRender.java From jpress with GNU Lesser General Public License v3.0

5 votes

/**
 * Replaces the {@code href} of every given element with the result of
 * {@code buildUrl} applied to its current value.
 *
 * @param elements the elements whose {@code href} attribute is rewritten
 */
private void replacePreviewHref(Elements elements) {
    Iterator<Element> links = elements.iterator();
    // NOTE(review): the return value is discarded; kept in case the call has a
    // side effect this method relies on — confirm and remove if it does not.
    RequestUtil.getCurrentUrl();
    while (links.hasNext()) {
        Element link = links.next();
        link.attr("href", buildUrl(link.attr("href")));
    }
}

Example 7

Source File: _WechatArticleImport.java From jpress with GNU Lesser General Public License v3.0

5 votes

/**
 * Rewrites the source of every {@code <img>} in the given HTML to a local attachment
 * path and collects the rewritten paths.
 *
 * For each image the URL is taken from {@code src} if present, otherwise from
 * {@code data-src}. It is passed through {@code replaceLast(url, "/", "__")}, its
 * {@code http://} prefix becomes {@code /attachment/} (otherwise {@code https://}
 * becomes {@code /attachment/s}), and every {@code ?} becomes {@code .png?}.
 *
 * Example of an incoming URL:
 * http://mmbiz.qpic.cn/mmbiz/4gZTdZfnQeDvQqCZFuVvYv8scGS7sEQTRETgISib1blz5iclAtnsccaJhaugmKc
 * hhm8mFOtjnicibibumazy8wPS6Xg/640?tp=webp&wxfrom=5&wx_lazy=1&wx_co=1
 *
 * @param content the HTML to process
 * @param imageUrls receives the rewritten image paths in document order
 * @return the serialized document after rewriting
 */
private String processContentImages(String content, List<String> imageUrls) {

        Document doc = Jsoup.parse(content);
        Elements images = doc.select("img");
        if (images == null) {
            return doc.toString();
        }

        for (Element image : images) {
            String source;
            if (image.hasAttr("src")) {
                source = image.attr("src");
            } else {
                source = image.attr("data-src");
            }

            String localPath = replaceLast(source, "/", "__");
            if (localPath.startsWith("http://")) {
                localPath = localPath.replace("http://", "/attachment/");
            } else {
                localPath = localPath.replace("https://", "/attachment/s");
            }
            localPath = localPath.replace("?", ".png?");

            image.removeAttr("data-src");
            image.attr("src", localPath);

            imageUrls.add(localPath);
        }

        return doc.toString();
    }

Example 8

Source File: JobConfParserImpl.java From Eagle with Apache License 2.0

5 votes

/**
 * Reads key/value pairs from the rows of the table with id {@code conf}.
 *
 * The text of a row's first child is the key and the text of its second child is the
 * value. Rows with fewer than two children are skipped.
 *
 * @param doc the parsed page
 * @return the extracted configuration entries; empty if no usable row is found
 */
public Map<String, String> parse(Document doc) {
	Map<String, String> configs = new HashMap<String, String>();
	for (Element row : doc.select("table[id=conf]").select("tbody").select("tr")) {
		Elements cells = row.children();
		// A row without both a key cell and a value cell cannot be read; indexing it
		// would throw IndexOutOfBoundsException and abort the whole parse.
		if (cells.size() < 2) {
			continue;
		}
		configs.put(cells.get(0).text(), cells.get(1).text());
	}
	return configs;
}

Example 9

Source File: JobCountersParserImpl.java From Eagle with Apache License 2.0

5 votes

/**
 * Reads counter values from the elements surrounding every link whose {@code href}
 * contains {@code singlejobcounter}.
 *
 * The text of the link's parent is the metric name. The value is the text of the
 * parent's third following sibling element, with commas removed, parsed as a long.
 *
 * @param doc the parsed page
 * @return metric name mapped to metric value
 */
@Override
public Map<String, Long> parse(Document doc) {
	Map<String, Long> counters = new HashMap<String, Long>();
	for (Element link : doc.select("a[href*=singlejobcounter]")) {
		Element nameCell = link.parent();
		String metricName = nameCell.text();
		Element valueCell = nameCell.nextElementSibling()
				.nextElementSibling()
				.nextElementSibling();
		String digits = valueCell.text().replace(",", "").trim();
		counters.put(metricName, Long.parseLong(digits));
	}
	return counters;
}

Example 10

Source File: WrapElements.java From jphp with Apache License 2.0

5 votes

/**
 * Wraps the given jsoup {@code Elements} and positions the internal iterator on the
 * first element, if there is one.
 *
 * @param env the environment passed to the superclass and to each wrapped element
 * @param wrappedObject the elements to wrap
 */
public WrapElements(Environment env, Elements wrappedObject) {
    super(env, wrappedObject);

    iterator = wrappedObject.iterator();

    // Pre-load the first element; iteratorValue is left untouched for an empty collection.
    if (iterator.hasNext()) {
        iteratorValue = new WrapElement(env, iterator.next());
    }
}

Example 11

Source File: Action.java From templatespider with Apache License 2.0

4 votes

/**
	 * Replaces the dynamic tags in the template pages.
	 * 1. Replaces the content of the title tag.
	 * 2. Removes the keywords and description meta tags.
	 *
	 * Returns early, after showing a message dialog, as soon as a template page has
	 * neither a title nor a head tag; later rows are then not processed.
	 */
	public static void replaceDongtaiTag(){
		/*
		 * Iterate over the template pages
		 */
		// NOTE(review): templatePageList is never read in this method.
		List<Map<String, String>> templatePageList = new ArrayList<Map<String,String>>();
		
		DefaultTableModel pageModel = Global.mainUI.getTemplatePageTableModel();
		int pageRowCount = pageModel.getRowCount();
		for (int i = 0; i < pageRowCount; i++) {
			// NOTE(review): map is created for every row but never read.
			Map<String, String> map = new HashMap<String, String>();
			// Name of the template page (column 0 of the table model)
			String name = (String) pageModel.getValueAt(i, 0);
			if(name != null && name.length() > 0){
				
				Template temp = Global.templateMap.get(name);
				if(temp != null){
					// The template page exists
					Document doc = temp.getDoc();
					
					// Remove the keywords and description meta tags
					Elements metaEles = doc.getElementsByTag("meta");
					Iterator<Element> it = metaEles.iterator();
					while(it.hasNext()){
						Element metaEle = it.next();
						String metaName = metaEle.attr("name");
						// NOTE(review): metaEle has already been dereferenced above, so its null check here cannot fail.
						if(metaEle != null && metaName != null){
							if(metaName.equalsIgnoreCase("keywords") || metaName.equalsIgnoreCase("description")){
								try {
									// Detach the element from the document, then drop it from the selection.
									metaEle.remove();
									it.remove();
								} catch (Exception e) {
									e.printStackTrace();
									System.out.println(metaEle);
								}
							}
						}
					}
					
					// Replace the title tag
					Elements titleEles = doc.getElementsByTag("title");
					Element titleEle = null;
					if(titleEles != null && titleEles.size() > 0){
						titleEle = titleEles.first();
					}else{
						// If there is no title tag, one needs to be added
						Elements headElements = doc.getElementsByTag("head");
						if(headElements == null || headElements.size() == 0){
							UI.showMessageDialog("模版页面"+temp.getFile().getName()+"中无head标签！模版页估计不完整！请手动补上head标签");
							return;
						}else{
//							titleEle = new Element(tag, baseUri)
//							headElements.first().appendElement(tagName)
							/*
							 * TODO: creating the missing title tag is not implemented yet
							 */
						}
					}
					if(titleEle != null){
						// Replace the title text with the dynamic tag chosen by the page type (column 1 of the table model)
						String type = (String) pageModel.getValueAt(i, 1);
						switch (type) {
						case "首页模版":
							titleEle.text(site_name);
							break;
						case "列表页模版":
							titleEle.text(siteColumn_name+"_"+site_name);
							break;
						case "详情页模版":
							titleEle.text(news_title+"_"+site_name);
							break;
						default:
							titleEle.text(site_name);
							break;
						}
					}
					
					// NOTE(review): the template was looked up by the table name but is stored back under its file name — confirm these are always equal.
					Global.templateMap.put(temp.getFile().getName(), temp);
				}
			}
		}
	}

Example 12

Source File: TemplateRender.java From jpress with GNU Lesser General Public License v3.0

4 votes

/**
 * Rewrites the URL stored in {@code attrName} of every given element so that it points
 * into the current template, optionally behind the CDN domain.
 *
 * Nothing is rewritten when {@code currentTemplate} is null. A URL is left unchanged
 * when it is blank, starts with "//", starts with "http" (case-insensitive), is a
 * {@code data:image/} value of a {@code src} attribute, or when the element carries a
 * {@code cdn-exclude} attribute.
 *
 * @param elements the elements to rewrite
 * @param attrName the attribute holding the URL
 */
private void replace(Elements elements, String attrName) {
    Iterator<Element> nodes = elements.iterator();
    if (currentTemplate == null) {
        return;
    }
    while (nodes.hasNext()) {
        Element node = nodes.next();
        String url = node.attr(attrName);

        if (StrUtil.isBlank(url)
                || url.startsWith("//")
                || url.toLowerCase().startsWith("http")
                || (attrName.equals("src") && url.startsWith("data:image/"))
                || node.hasAttr("cdn-exclude")) {
            continue;
        }

        String resolved;
        if (url.startsWith("/")) {
            // Absolute path: prepend the context path unless it is already there.
            boolean needsContextPath = contextPath.length() > 0
                    && !url.startsWith(contextPath + "/");
            resolved = needsContextPath ? contextPath + url : url;
        } else if (url.startsWith("./")) {
            // Path relative to the template: drop the leading dot and add the template path.
            resolved = contextPath + currentTemplate.getRelativePath() + url.substring(1);
        } else {
            // Starts directly with a file or directory name.
            resolved = contextPath + currentTemplate.getRelativePath() + "/" + url;
        }

        if (StrUtil.isNotBlank(cdnDomain)) {
            resolved = cdnDomain + resolved;
        }

        node.attr(attrName, resolved);
    }
}

Example 13

Source File: JSoupUtils.java From ZhiHu-TopAnswer with Apache License 2.0

4 votes

/**
 * Extracts the list of topic answers from the home page.
 *
 * One entry is built per {@code div.content} element. An entry is only returned when
 * both its title and its URL are non-empty.
 *
 * @param document the parsed home page
 * @return the extracted entries in document order; never {@code null}
 */
public static List<TopicAnswers> getTopicList(Document document) {
    List<TopicAnswers> list = new ArrayList<>();
    for (Element body : document.select("div.content")) {
        TopicAnswers answers = new TopicAnswers();

        // Elements.first() returns null for an empty selection.
        Element questionLink = body.select("a.question_link").first();
        if (questionLink != null) {
            answers.setTitle(questionLink.text());
            answers.setUrl("https://www.zhihu.com" + questionLink.attr("href"));
        }

        Element voteCount = body.select("a.zm-item-vote-count.js-expand.js-vote-count").first();
        if (voteCount != null) {
            answers.setVote(voteCount.text());
        }

        Elements summaries = body.select("div.zh-summary.summary.clearfix");

        String descBody = summaries.text();
        // Drops the last four characters of the summary text
        // (presumably a trailing "show all" label — TODO confirm).
        if (descBody.length() > 4) {
            descBody = descBody.substring(0, descBody.length() - 4);
        }
        answers.setBody(descBody);

        Element summary = summaries.first();
        if (summary != null) {
            // A summary without child elements has no first child; guard against null
            // before inspecting the tag name.
            Element firstChild = summary.children().first();
            if (firstChild != null && firstChild.tagName().equals("img")) {
                answers.setImg(firstChild.attr("src"));
            }
        }

        if (!TextUtils.isEmpty(answers.getTitle()) && !TextUtils.isEmpty(answers.getUrl())) {
            L.i(TAG, answers.toString());
            list.add(answers);
        }
    }
    return list;
}

Example 14

Source File: HeadingsHierarchyChecker.java From Asqatasun with GNU Affero General Public License v3.0

4 votes

/**
 * Checks whether the headings hierarchy is well-structured and reports the result to
 * the given handler.
 *
 * An empty collection yields NOT_APPLICABLE. Otherwise the first element is the
 * reference. Each later element fails the check when its index exceeds the previous
 * one by two or more, or else when its index is below the reference index. Elements
 * for which {@code getHeaderIndex} returns -1 are ignored.
 *
 * @param elements the heading elements to check
 * @param testSolutionHandler receives the overall result
 */
private void checkHeadingsHierarchy(
        Elements elements,
        TestSolutionHandler testSolutionHandler) {
    if (elements.isEmpty()) {
        testSolutionHandler.addTestSolution(TestSolution.NOT_APPLICABLE);
        return;
    }

    Iterator<Element> headings = elements.iterator();

    // The first element is the reference every later element is compared with.
    Element firstHeading = headings.next();
    int firstLevel = getHeaderIndex(firstHeading);

    Element lastHeading = firstHeading;
    int lastLevel = firstLevel;
    TestSolution outcome = TestSolution.PASSED;

    while (headings.hasNext()) {
        Element heading = headings.next();
        int level = getHeaderIndex(heading);
        if (level == -1) {
            continue;
        }

        if (level - lastLevel >= 2) {
            outcome = TestSolution.FAILED;
            addSourceCodeRemark(
                    TestSolution.FAILED,
                    heading,
                    HEADER_NOT_HIERARCHICALLY_WELL_DEFINED_MSG,
                    getEvidenceElement(PREVIOUS_H_TAG_INDEX_EE, getEvidenceElementMsg(lastLevel, lastHeading)));
        } else if (level < firstLevel) {
            outcome = TestSolution.FAILED;
            addSourceCodeRemark(
                    TestSolution.FAILED,
                    heading,
                    HEADER_NOT_HIERARCHICALLY_WELL_DEFINED_MSG,
                    getEvidenceElement(FIRST_H_TAG_INDEX_EE, getEvidenceElementMsg(firstLevel, firstHeading)));
        }
        lastLevel = level;
        lastHeading = heading;
    }
    testSolutionHandler.addTestSolution(outcome);
}

Example 15

Source File: FileReader.java From calcite with Apache License 2.0

4 votes

/**
 * Creates an iterator backed by the iterator of the given row elements.
 *
 * @param rows the row elements to iterate over
 */
FileReaderIterator(Elements rows) {
  this.rowIterator = rows.iterator();
}

Example 16

Source File: BakaTsukiParserAlternative.java From coolreader with MIT License

4 votes

/**
 * Parses the book collections of a novel page and stores them on {@code novel}.
 *
 * Every {@code h1}/{@code h2} heading is considered. A heading qualifies when one of
 * its {@code span} ids contains a language-specific marker, the novel's page name, or
 * the novel's redirect target. For a qualifying heading the books are parsed with
 * method 1, falling back to methods 2 and 3 when nothing was found.
 *
 * Any exception raised while walking the headings is logged and the books collected so
 * far are still validated and stored.
 *
 * @param doc the parsed novel page
 * @param novel the novel whose book collections are set
 * @param language the alternative language key, or {@code null} for no
 *        language-specific markers
 */
private static void parseNovelChapters(Document doc, NovelCollectionModel novel, String language) {
	ArrayList<BookModel> books = new ArrayList<BookModel>();
	boolean oneBookOnly = false;
	ArrayList<String> parser = null;
	if (language != null) {
		parser = AlternativeLanguageInfo.getAlternativeLanguageInfo().get(language).getParserInfo();
	}
	// Without markers (no language given, or none configured) fall back to an empty
	// list. Dereferencing null here used to raise an NPE that the catch-all below
	// swallowed, silently aborting the whole parse.
	if (parser == null) {
		parser = new ArrayList<String>();
	}
	try {
		Elements headings = doc.select("h1,h2");
		for (Element h2 : headings) {
			Elements spans = h2.select("span");
			if (spans.size() == 0) {
				continue;
			}

			// find span with id containing "_by" or 'Full_Text'
			// or contains with Page Name or "Side_Stor*" or "Short_Stor*"
			// or contains "_Series" (Maru-MA)
			// or if redirected, use the redirect page name.
			boolean containsBy = false;
			for (Element s : spans) {
				Log.d(TAG, "Checking: " + s.id());
				boolean matchesMarker = false;
				for (String marker : parser) {
					if (s.id().contains(marker)) {
						matchesMarker = true;
						break;
					}
				}
				if (matchesMarker || s.id().contains(novel.getPage()) || (novel.getRedirectTo() != null && s.id().contains(novel.getRedirectTo()))) {
					containsBy = true;
					Log.d(TAG, "Got valid id: " + s.id());
					break;
				}
				Log.d(TAG, "Not valid id: " + s.id());
			}
			if (!containsBy) {
				continue;
			}

			ArrayList<BookModel> tempBooks = parseBooksMethod1(novel, h2, language);
			if (tempBooks != null && tempBooks.size() > 0) {
				books.addAll(tempBooks);
			}
			// A null result is treated like an empty one so the fallbacks still run
			// instead of failing on tempBooks.size().
			if (books.size() == 0 || (oneBookOnly && (tempBooks == null || tempBooks.size() == 0))) {
				Log.d(TAG, "No books found, use method 2: Only have 1 book, chapter in <p> tag.");
				tempBooks = parseBooksMethod2(novel, h2, language);
				if (tempBooks != null && tempBooks.size() > 0) {
					oneBookOnly = true;
					books.addAll(tempBooks);
				}
			}
			if (books.size() == 0 || (oneBookOnly && (tempBooks == null || tempBooks.size() == 0))) {
				Log.d(TAG, "No books found, use method 3: Only have 1 book.");
				tempBooks = parseBooksMethod3(novel, h2, language);
				if (tempBooks != null && tempBooks.size() > 0) {
					oneBookOnly = true;
					books.addAll(tempBooks);
				}
			}
		}
	} catch (Exception e) {
		Log.e(TAG, "Unknown Exception for " + novel.getPage() + ": " + e.getMessage(), e);
	}

	novel.setBookCollections(CommonParser.validateNovelBooks(books));
}