Java Code Examples for org.jsoup.nodes.Element#select()

The following examples show how to use org.jsoup.nodes.Element#select(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.
Example 1
Source File: SpiderService.java    From Doctor with Apache License 2.0 6 votes vote down vote up
/**
 * Scrapes the disease properties from the document loaded for {@code attr}.
 *
 * <p>Every element matched by {@code div_content} yields one {@link Propertiy}. Its name is the
 * text of the first {@code span_con} match inside that element, or {@code p_default} when the
 * element contains no such match.
 *
 * @param attr value handed to {@code getDocument} to load the page
 * @return the scraped properties in document order, numbered from 1; empty when nothing matches
 */
private List<Propertiy> queryPropertiys(String attr){
    List<Propertiy> propertiyList = new ArrayList<>();
    Document document = getDocument(attr);
    for (Element element : document.select(div_content)) {
        Propertiy propertiy = new Propertiy();
        // select() never returns null, so a single query plus a first() check is enough.
        Element nameElement = element.select(span_con).first();
        propertiy.setName(nameElement != null ? nameElement.text() : p_default);
        // Fill in the scraped property; -1 looks like a "no id yet" placeholder — TODO confirm.
        propertiy.setId(-1);
        propertiy.setNumber(propertiyList.size() + 1);
        propertiy.setValue(getValue(element));
        propertiyList.add(propertiy);
    }
    return propertiyList;
}
 
Example 2
Source File: LeaveOneOutCV.java    From NLIWOD with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Reads one column of the QALD-6 multilingual log page of the given system.
 *
 * <p>The method parses {@code multilingual_<system>.html}, takes the sixth {@code table}
 * (index 5) and collects the own text of the third {@code td} (index 2) of every row. The first
 * collected entry is dropped because it belongs to the table head.
 *
 * @param system system name used to build the log file name
 * @return the collected cell texts; an empty list when the file cannot be read
 */
public static ArrayList<String> loadSystemP(String system){

	Path datapath = Paths.get("./src/main/resources/QALD6MultilingualLogs/multilingual_" + system + ".html");
	ArrayList<String> result = Lists.newArrayList();

	// Files.lines keeps the file open until the stream is closed, so scope it explicitly.
	try (java.util.stream.Stream<String> lines = Files.lines(datapath)) {
		String loadedData = lines.collect(Collectors.joining());
		Document doc = Jsoup.parse(loadedData);
		Element table = doc.select("table").get(5);
		for (Element row : table.select("tr")) {
			Elements tableEntry = row.select("td");
			result.add(tableEntry.get(2).ownText());
		}
		// A table without rows leaves nothing to drop; remove(0) would throw on an empty list.
		if (!result.isEmpty()) {
			result.remove(0); //remove the head of the table
		}
		return result;
	} catch (IOException e) {
		e.printStackTrace();
		log.debug("loading failed.");
		return result;
	}
}
 
Example 3
Source File: HiParser.java    From hipda with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Builds a friend-info list item from the given element.
 *
 * <p>The first link supplies uid, avatar and author; the first image decides the "new" flag.
 * The second link, if present, is detached from {@code root} before the remaining text is
 * stored as the item's info.
 *
 * @param root element holding the friend information; it is modified when a second link exists
 * @return the populated list item
 */
private static SimpleListItemBean parseFriendInfo(Element root) {
    SimpleListItemBean bean = new SimpleListItemBean();
    bean.setTitle("好友信息");

    Elements anchors = root.select("a");
    if (!anchors.isEmpty()) {
        Element userLink = anchors.first();
        String uid = Utils.getMiddleString(userLink.attr("href"), "uid=", "&");
        bean.setAvatarUrl(HiUtils.getAvatarUrlByUid(uid));
        bean.setUid(uid);
        bean.setAuthor(userLink.text());
    }

    // Flag the item as new when the first image is the "new PM" marker.
    Elements images = root.select("img");
    if (!images.isEmpty() && images.first().attr("src").contains(HiUtils.NewPMImage)) {
        bean.setNew(true);
    }

    // Drop the "add friend" link so its text does not end up in the info string.
    if (anchors.size() >= 2) {
        anchors.get(1).remove();
    }
    bean.setInfo(root.text());
    return bean;
}
 
Example 4
Source File: CDTClassifierEvaluation.java    From NLIWOD with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Reads one column of the QALD-6 multilingual log page of the given system.
 *
 * <p>The method parses {@code multilingual_<system>.html}, takes the sixth {@code table}
 * (index 5) and collects the own text of the second {@code td} (index 1) of every row. The first
 * collected entry is dropped because it belongs to the table head.
 *
 * @param system system name used to build the log file name
 * @return the collected cell texts; an empty list when the file cannot be read
 */
public static ArrayList<String> loadSystemR(String system){
	Path datapath = Paths.get("./src/main/resources/QALD6MultilingualLogs/multilingual_" + system + ".html");
	ArrayList<String> result = Lists.newArrayList();

	// Files.lines keeps the file open until the stream is closed, so scope it explicitly.
	try (java.util.stream.Stream<String> lines = Files.lines(datapath)) {
		String loadedData = lines.collect(Collectors.joining());
		Document doc = Jsoup.parse(loadedData);
		Element table = doc.select("table").get(5);
		for (Element row : table.select("tr")) {
			Elements tableEntry = row.select("td");
			result.add(tableEntry.get(1).ownText());
		}
		// A table without rows leaves nothing to drop; remove(0) would throw on an empty list.
		if (!result.isEmpty()) {
			result.remove(0); //remove the head of the table
		}
		return result;
	} catch (IOException e) {
		e.printStackTrace();
		log.debug("loading failed.");
		return result;
	}
}
 
Example 5
Source File: SelectorTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/** Checks how descendant selectors behave when evaluated from a context element. */
@Test public void descendant() {
    Document doc = Jsoup.parse("<div class=head><p class=first>Hello</p><p>There</p></div><p>None</p>");
    Element head = doc.getElementsByClass("head").first();

    // The context element itself may satisfy the leading part of the selector.
    Elements paragraphs = head.select(".head p");
    assertEquals(2, paragraphs.size());
    assertEquals("Hello", paragraphs.get(0).text());
    assertEquals("There", paragraphs.get(1).text());

    Elements firstParagraph = head.select("p.first");
    assertEquals(1, firstParagraph.size());
    assertEquals("Hello", firstParagraph.get(0).text());

    // "p .first" asks for a .first element below a p; the p itself must not match.
    Elements nestedFirst = head.select("p .first");
    assertEquals(0, nestedFirst.size());

    // Ancestors of the context element are out of scope for the query.
    Elements outsideContext = head.select("body div.head");
    assertEquals(0, outsideContext.size());
}
 
Example 6
Source File: MovieServiceImpl.java    From albert with MIT License 5 votes vote down vote up
/**
 * Scrapes one listing page and stores every movie for which {@code checkHave(title)} is false.
 *
 * <p>For each such movie the record is loaded via {@code getRecord}, inserted through
 * {@code movieMapper}, read back by name and sent on with {@code sendMessage}.
 *
 * @param page paging information; only the current page number is read
 * @return the movies inserted during this call, never {@code null}
 * @throws RuntimeException wrapping the {@link IOException} when the page cannot be fetched
 */
@Override
public List<Movie> getPage(PageInfo page) {
	Document doc;
	try {
		doc = getConnect(getPageUrl(page.getCurPageNo())).get();
	} catch (IOException e) {
		throw new RuntimeException(e);
	}

	// select() yields an empty Elements instead of null, so no null branch is required.
	List<Movie> movies = new ArrayList<>();
	for (Element li : doc.select("#post_container > li")) {
		for (Element a : li.select(".thumbnail a")) {
			String title = a.attr("title");
			if (checkHave(title)) {
				continue;
			}
			String href = a.attr("href");
			String imgUrl = a.select(" > img").attr("src");

			Movie vo = getRecord(href);
			vo.setName(title);
			vo.setImg(imgUrl);
			movieMapper.addMovie(vo);
			logger.info("插入:"+vo.getName());

			// Re-read the stored row before sending it on.
			Movie vi = movieMapper.getMovieDetailbyName(title);
			vi.setReviewNum(0);
			vi.setAddTime(new Date());
			vi.setUpdateTime(new Date());
			super.sendMessage(vi, Constants.Cache.Type.save);
			movies.add(vo);
		}
	}
	return movies;
}
 
Example 7
Source File: EudicSentence.java    From ankihelper with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Returns the first match of {@code query} below {@code soup}.
 *
 * @param soup     element to search in
 * @param query    CSS query passed to {@code select}
 * @param toString {@code true} to return the match's {@code toString()} form, {@code false}
 *                 for its text
 * @return the rendered first match, or an empty string when nothing matches
 */
static String getSingleQueryResult(Element soup, String query, boolean toString){
    Elements matches = soup.select(query);
    if (matches.isEmpty()) {
        return "";
    }
    Element firstMatch = matches.get(0);
    return toString ? firstMatch.toString() : firstMatch.text();
}
 
Example 8
Source File: indianExpress.java    From Gazetti_Newspaper_Reader with MIT License 5 votes vote down vote up
/**
 * Updates {@code mImageURL} from the first element matching the configured Indian Express
 * image selector and returns it.
 *
 * @param bodyElement element to search in
 * @return {@code mImageURL}; unchanged when nothing matches
 */
private String getImageURL(Element bodyElement) {
    Elements candidates = bodyElement.select(ConfigService.getInstance().getIndianExpressImage());
    if (candidates.isEmpty()) {
        return mImageURL;
    }
    mImageURL = candidates.first().attr("src");
    return mImageURL;
}
 
Example 9
Source File: AppsGamesCatalogApi.java    From 4pdaClient-plus with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the topics listed under one entry of the apps catalog page.
 *
 * <p>The catalog page is fetched, the HTML section following the anchor
 * {@code entry<catalogId>} is cut out with a regular expression, and every {@code li} that
 * contains a link inside an {@code ol[type=1]} list becomes a {@link Topic}. The text of the
 * list's first {@code span} is used as the forum title of its topics.
 *
 * @param client    HTTP client used to fetch the catalog page
 * @param catalogId id of the catalog entry; matched literally
 * @return the topics found; empty when the entry section is not present
 * @throws IOException declared for the page fetch performed through {@code client}
 */
public static ArrayList<Topic> loadCategoryThemes(IHttpClient client, String catalogId) throws IOException {
    String pageBody = client.performGet(APPS_CATALOG_URL).getResponseBody();
    ArrayList<Topic> res = new ArrayList<>();

    // Quote the id so regex metacharacters in it cannot alter the pattern.
    // String.valueOf keeps the previous "entrynull" lookup for a null id.
    Pattern sectionPattern = Pattern.compile("<a name=\"entry" + Pattern.quote(String.valueOf(catalogId)) + "\">([\\s\\S]*?)</div>(?:<!--Begin Msg Number|<!-- TABLE FOOTER)", Pattern.CASE_INSENSITIVE);
    Matcher sectionMatcher = sectionPattern.matcher(pageBody);
    if (!sectionMatcher.find()) return res;

    // Loop-invariant: compile once instead of once per topic.
    Pattern descriptionPattern = Pattern.compile("</a>(?:\\s*</b>\\s*-\\s*)(.*)?(?:<br\\s*/>|$)", Pattern.CASE_INSENSITIVE);

    Document doc = Jsoup.parse(sectionMatcher.group(1));
    for (Element subCategoryElement : doc.select("ol[type=1]")) {
        String subCategoryTitle = "";
        Elements titleElements = subCategoryElement.select("span");
        if (titleElements.size() > 0) {
            subCategoryTitle = titleElements.first().text();
        }

        for (Element topicElement : subCategoryElement.select("li")) {
            Elements links = topicElement.select("a");
            if (links.size() == 0) continue;

            Element link = links.get(0);
            Uri uri = Uri.parse(link.attr("href"));

            Topic topic = new Topic(uri.getQueryParameter("showtopic"), link.text());
            Matcher descriptionMatcher = descriptionPattern.matcher(topicElement.html());
            if (descriptionMatcher.find())
                topic.setDescription(descriptionMatcher.group(1));
            topic.setForumTitle(subCategoryTitle);
            res.add(topic);
        }
    }

    return res;
}
 
Example 10
Source File: OutputFormatter.java    From JumpGo with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Appends the text of every accepted {@code tagName} element below {@code node} to {@code sb},
 * each followed by a blank line.
 *
 * <p>An element is skipped when it, or any ancestor up to (excluding) {@code node}, is flagged by
 * {@code unlikely}, or when its text is empty, shorter than the minimum for the current
 * paragraph index, or longer than twice its letter count.
 *
 * @param node    root of the search
 * @param sb      receives the accepted texts
 * @param tagName CSS query selecting the candidate elements
 * @return the number of accepted elements whose tag name is {@code p}
 */
private int append(Element node, StringBuilder sb, String tagName) {
    int paragraphCount = 0;
    int acceptedIndex = 0;
    for (Element candidate : node.select(tagName)) {
        // Walk from the candidate up to 'node' looking for an unlikely element.
        boolean rejectedByAncestry = false;
        for (Element cursor = candidate; cursor != null && !cursor.equals(node); cursor = cursor.parent()) {
            if (unlikely(cursor)) {
                rejectedByAncestry = true;
                break;
            }
        }
        if (rejectedByAncestry) {
            continue;
        }

        String text = node2Text(candidate);
        boolean tooShort = text.isEmpty() || text.length() < getMinParagraph(acceptedIndex);
        if (tooShort || text.length() > SHelper.countLetters(text) * 2) {
            continue;
        }

        if ("p".equals(candidate.tagName())) {
            paragraphCount++;
        }

        sb.append(text).append("\n\n");
        acceptedIndex++;
    }
    return paragraphCount;
}
 
Example 11
Source File: SelectorTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/** Checks multi-level descendant selectors evaluated from a context element. */
@Test public void deeperDescendant() {
    Document doc = Jsoup.parse("<div class=head><p><span class=first>Hello</div><div class=head><p class=first><span>Another</span><p>Again</div>");
    Element firstHead = doc.getElementsByClass("head").first();

    // Only the span inside the first div qualifies; the second div is outside the context.
    Elements matches = firstHead.select("div p .first");
    assertEquals(1, matches.size());
    Element match = matches.first();
    assertEquals("Hello", match.text());
    assertEquals("span", match.tagName());

    // Ancestors of the context element are out of scope for the query.
    Elements outsideContext = firstHead.select("body p .first");
    assertEquals(0, outsideContext.size());
}
 
Example 12
Source File: SynonymDiscriminationExtractor.java    From superword with Apache License 2.0 5 votes vote down vote up
/**
 * Parses synonym discriminations out of an HTML page.
 *
 * <p>Blocks whose description selector does not yield exactly two elements are logged and
 * skipped. Word entries that do not split into exactly two parts on {@code ":"} are logged and
 * left out. Any exception is logged and ends parsing with whatever was collected so far.
 *
 * @param html page source to parse
 * @return the discriminations that could be parsed, possibly empty
 */
public static Set<SynonymDiscrimination> parseSynonymDiscrimination(String html){
    Set<SynonymDiscrimination> result = new HashSet<>();
    try {
        Elements blocks = Jsoup.parse(html).select(SYNONYM_DISCRIMINATION_CSS_PATH);
        for (Element block : blocks) {
            String title = block.select(TITLE).text().trim();

            Elements descriptions = block.select(DES);
            int descriptionCount = descriptions.size();
            if (descriptionCount != 2) {
                LOGGER.error("解析描述信息出错,elements.size="+descriptionCount);
                continue;
            }
            String des = descriptions.get(0).text()
                    .replace("“ ”", "")
                    .replace("“ ", "“")
                    .trim();

            SynonymDiscrimination discrimination = new SynonymDiscrimination();
            discrimination.setTitle(title);
            discrimination.setDes(des);

            for (Element wordElement : block.select(WORDS)) {
                String word = wordElement.text();
                String[] parts = word.split(":");
                if (parts.length != 2) {
                    LOGGER.error("解析词义信息出错:"+word);
                    continue;
                }
                discrimination.addWord(new Word(parts[0].trim(), parts[1].trim()));
            }

            result.add(discrimination);
            LOGGER.info("解析出同义词辨析:" + discrimination);
        }
    } catch (Exception e) {
        LOGGER.error("解析同义词辨析出错", e);
    }
    return result;
}
 
Example 13
Source File: MTGoldFishDashBoard.java    From MtgDesktopCompanion with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Scrapes the format-staples table for the given format and filter.
 *
 * <p>The first two table rows are dropped before parsing. Rows that cannot be parsed are logged
 * and skipped. For the {@code lands} filter the percentage columns are read one position to the
 * left.
 *
 * @param f      format whose staples are requested; {@code COMMANDER} maps to {@code commander_1v1}
 * @param filter one of spells, creatures, all, lands
 * @return the parsed rows in table order
 * @throws IOException declared for the page download
 */
@Override
public List<CardDominance> getBestCards(MagicFormat.FORMATS f, String filter) throws IOException {

	// spells, creatures, all, lands

	// Locale.ROOT keeps the URL segment stable under locales with special casing rules (e.g. Turkish).
	String u = getString(WEBSITE) + "/format-staples/" + f.name().toLowerCase(java.util.Locale.ROOT) + "/full/" + filter;

	if(f == MagicFormat.FORMATS.COMMANDER)
		u=getString(WEBSITE) + "/format-staples/commander_1v1/full/" + filter;

	Document doc = URLTools.extractHtml(u);

	logger.debug("get best cards : " + u);
	Elements trs = doc.select("table tr");
	// Drop the two leading rows, tolerating pages that have fewer rows.
	for (int skipped = 0; skipped < 2 && !trs.isEmpty(); skipped++) {
		trs.remove(0);
	}

	// Loop-invariant column shift applied for the lands table.
	int correct = "lands".equalsIgnoreCase(filter) ? 1 : 0;

	List<CardDominance> ret = new ArrayList<>();
	for (Element e : trs) {
		Elements tds = e.select(MTGConstants.HTML_TAG_TD);
		try {
			CardDominance d = new CardDominance();
			d.setPosition(Integer.parseInt(tds.get(0).text()));
			d.setCardName(tds.get(1).text());
			d.setDecksPercent(Double.parseDouble(tds.get(3 - correct).text().replace("%", "")));
			d.setPlayers(Double.parseDouble(tds.get(4 - correct).text().replace("%", "")));

			ret.add(d);
		} catch (Exception ex) {
			logger.error("Error parsing " + tds, ex);
		}

	}
	return ret;
}
 
Example 14
Source File: LessonsTool.java    From zhangshangwuda with Apache License 2.0 4 votes vote down vote up
/**
 * Extracts lesson entries from a timetable HTML page.
 *
 * <p>Every {@code tr[align=center]} row is treated as one lesson and each of its
 * {@code td[width=113]} cells as one weekday slot. A cell whose text has at least two characters
 * produces a map with the keys {@code id}, {@code name}, {@code teacher}, {@code day},
 * {@code ste}, {@code mjz}, {@code time}, {@code place} and {@code other}. The id is obtained by
 * incrementing the value stored in {@code LessonsSharedPreferencesTool} and writing it back.
 *
 * <p>Side effect: the passed HTML is stored in the {@code thtml} field.
 *
 * @param context context handed to {@code LessonsSharedPreferencesTool}
 * @param html    page source to parse
 * @return the extracted lessons, or {@code null} when {@code html} is empty according to
 *         {@code StringUtils.isEmpty}
 */
public static List<Map<String, String>> getLessonsList(Context context,
		String html) {
	Document doc = null;
	thtml = html;
	if (StringUtils.isEmpty(html)) {
		return null;
	}
	doc = Jsoup.parse(thtml);
	if (doc == null) {
		return null;
	}

	List<Map<String, String>> list = new ArrayList<Map<String, String>>();
	Elements lessons = doc.select("tr[align=center]");
	for (Element lesson : lessons) {
		Elements times = lesson.select("td[width=113]");
		// Counts the cells seen so far in this row; used as the 1-based day number.
		int weekday = 0;
		for (Element time : times) {
			String tinfo = time.text();
			if (tinfo.length() < 2) {
				// Cell too short to hold lesson data: only advance the day counter.
				++weekday;
				continue;
			} else {
				Map<String, String> map = new HashMap<String, String>();
				Integer tid = LessonsSharedPreferencesTool
						.getLessonsId(context);
				++tid;
				LessonsSharedPreferencesTool.setLessonsId(context, tid);
				// Set the lesson ID
				map.put("id", String.valueOf(tid));
				// Extract the course name
				map.put("name", lesson.select("td[width=80]").text());
				// Extract the teacher name
				map.put("teacher", lesson.select("td[width=52]").text());
				// Extract the day of the week the lesson takes place on
				++weekday;
				map.put("day", Integer.toString(weekday));
				// Extract the start/end weeks: everything before the first "周"
				// NOTE(review): indexOf returns -1 when "周" is absent, which makes substring throw.
				int tpos = tinfo.indexOf("周");
				map.put("ste", tinfo.substring(0, tpos));
				// Extract the "every N weeks" value: one character, three positions after "周"
				tinfo = tinfo.substring(tpos + 3);
				map.put("mjz", tinfo.substring(0, 1));
				// Extract the class periods: skip four characters, then read up to "节"
				tinfo = tinfo.substring(4);
				tpos = tinfo.indexOf("节");
				map.put("time", tinfo.substring(0, tpos));
				// Extract the location: whatever follows two positions after "节", if anything
				if (tinfo.length() > tpos + 2) {
					tinfo = tinfo.substring(tpos + 2);
					map.put("place", tinfo.substring(0));
				} else {
					map.put("place", "");
				}
				// Extract the remarks
				map.put("other", lesson.select("td[width=100]").text());
				list.add(map);
			}
		}
	}
	return list;
}
 
Example 15
Source File: Utils.java    From SteamGifts with MIT License 4 votes vote down vote up
/**
 * Reads all giveaways shown on a list page.
 * <p>Do not use this for single giveaways taken from the featured list; their HTML layout is
 * different (see {@link LoadGiveawayDetailsTask#loadGiveaway(Document, Uri)}).</p>
 *
 * @param document the loaded list page
 * @return the giveaways in page order
 */
public static List<Giveaway> loadGiveawaysFromList(Document document) {
    List<Giveaway> result = new ArrayList<>();

    for (Element row : document.select(".giveaway__row-inner-wrap")) {
        // The heading links carry both the title link and, optionally, an icon link.
        Elements headingLinks = row.select("h2 a");
        Element titleLink = headingLinks.first();

        // Basic information: id and name come from the path of the title link, if it has one.
        String giveawayId = null;
        String giveawayName = null;
        if (titleLink.hasAttr("href")) {
            List<String> segments = Uri.parse(titleLink.attr("href")).getPathSegments();
            giveawayId = segments.get(1);
            giveawayName = segments.get(2);
        }
        Giveaway giveaway = new Giveaway(giveawayId);
        giveaway.setName(giveawayName);

        giveaway.setTitle(titleLink.text());
        giveaway.setCreator(row.select(".giveaway__username").text());

        // Entry count is the leading number of the first link label; comment count is ignored.
        String entriesLabel = row.select(".giveaway__links a span").first().text();
        giveaway.setEntries(parseInt(entriesLabel.split(" ")[0]));

        giveaway.setEntered(row.hasClass("is-faded"));

        // More details: the second-to-last heading link is the icon, unless it is the title link.
        Uri iconUri = null;
        int linkCount = headingLinks.size();
        if (linkCount >= 2) {
            Element icon = headingLinks.get(linkCount - 2);
            if (icon != titleLink) {
                iconUri = Uri.parse(icon.attr("href"));
            }
        }

        Utils.loadGiveaway(giveaway, row, "giveaway", "giveaway__heading__thin", iconUri);
        result.add(giveaway);
    }

    return result;
}
 
Example 16
Source File: FormatUtil.java    From wlmedia with Apache License 2.0 4 votes vote down vote up
/**
 * Converts the home page HTML into a JSON array string.
 *
 * <p>Each element with class {@code bx-sya} becomes an object
 * {@code {"type": <text of first .hd>, "values": [...]}}; each link inside its first {@code .bd}
 * becomes {@code {"name": <text of its i elements>, "url": uri + href}}. All values are escaped
 * so that quotes, backslashes and control characters in the page cannot break the JSON.
 *
 * @param html page source to parse
 * @param uri  prefix prepended to every link's {@code href}
 * @return the JSON text; {@code "[]"} when no section is found
 */
public static String formatHomePage(String html, String uri)
{
    StringBuilder buffer = new StringBuilder("[");
    Elements sections = Jsoup.parse(html).getElementsByClass("bx-sya");
    for(int i = 0; i < sections.size(); i++)
    {
        if(i > 0)
        {
            buffer.append(",");
        }
        Element section = sections.get(i);
        Element hd = section.getElementsByClass("hd").get(0);
        buffer.append("{\"type\":\"");
        buffer.append(escapeJsonString(hd.text()));
        buffer.append("\",\"values\":[");

        Elements links = section.getElementsByClass("bd").get(0).select("a");
        for(int j = 0; j < links.size(); j++)
        {
            if(j > 0)
            {
                buffer.append(",");
            }
            Element link = links.get(j);
            buffer.append("{\"name\":\"");
            buffer.append(escapeJsonString(link.select("i").text()));
            buffer.append("\",\"url\":\"");
            buffer.append(escapeJsonString(uri + link.attr("href")));
            buffer.append("\"}");
        }
        buffer.append("]}");
    }
    buffer.append("]");
    return buffer.toString();
}

/** Escapes a value for use inside a double-quoted JSON string. */
private static String escapeJsonString(String raw)
{
    StringBuilder out = new StringBuilder(raw.length() + 8);
    for(int i = 0; i < raw.length(); i++)
    {
        char c = raw.charAt(i);
        switch(c)
        {
            case '"':
                out.append("\\\"");
                break;
            case '\\':
                out.append("\\\\");
                break;
            case '\n':
                out.append("\\n");
                break;
            case '\r':
                out.append("\\r");
                break;
            case '\t':
                out.append("\\t");
                break;
            default:
                if(c < 0x20)
                {
                    // Remaining control characters must be written as \\uXXXX in JSON.
                    out.append(String.format("\\u%04x", (int) c));
                }
                else
                {
                    out.append(c);
                }
        }
    }
    return out.toString();
}
 
Example 17
Source File: ParseV9PronVideo.java    From v9porn with MIT License 4 votes vote down vote up
/**
 * Parses the video entries found below a container element.
 *
 * <p>Entries without a link are skipped. Missing pieces no longer abort the whole parse: a
 * missing title element yields an empty title, a missing "added" label yields an empty info
 * string, and the duration fallback is only applied when both of its markers are present.
 *
 * @param container element holding the entry grid
 * @return the parsed items in document order
 */
private static List<V9PornItem> parserByDivContainer(Element container) {
    List<V9PornItem> v9PornItemList = new ArrayList<>();
    Elements select = container.select("div.row>div.col-sm-12>div.row>div");

    for (Element item : select) {
        Element a = item.selectFirst("a");
        if (a == null) {
            continue;
        }
        V9PornItem v9PornItem = new V9PornItem();

        // first() returns null when the class is absent; fall back to an empty title.
        Element titleEle = a.getElementsByClass("video-title").first();
        v9PornItem.setTitle(titleEle != null ? titleEle.text().trim() : "");

        Element imgEle = a.selectFirst("img.img-responsive");
        if (imgEle != null) {
            v9PornItem.setImgUrl(imgEle.attr("src"));
        }

        Element durationEle = a.selectFirst("span.duration");
        if (durationEle != null) {
            v9PornItem.setDuration(durationEle.text().trim());
        } else {
            v9PornItem.setDuration("00:00");
        }

        String contentUrl = a.attr("href");
        String viewKey = contentUrl.substring(contentUrl.indexOf("?") + 1);
        v9PornItem.setViewKey(viewKey);

        String allInfo = item.text();

        // Added: / 添加時間: / 添加时间:
        int start = allInfo.indexOf("添加时间:");
        if (start == -1) {
            start = allInfo.indexOf("Added:");
            if (start == -1) {
                start = allInfo.indexOf("添加時間:");
            }
        }
        // substring(-1) would throw and discard every item parsed so far.
        String info = start == -1 ? "" : allInfo.substring(start);

        if (TextUtils.equals(v9PornItem.getDuration(), "00:00")) {
            // Take the text between the duration label and the views label, but only when both
            // exist in that order; a missing label must not produce an arbitrary slice.
            String durationLabel = "时长:";
            int labelPos = allInfo.indexOf(durationLabel);
            int valueEnd = allInfo.indexOf("查看");
            int valueStart = labelPos + durationLabel.length();
            if (labelPos != -1 && valueEnd >= valueStart) {
                v9PornItem.setDuration(allInfo.substring(valueStart, valueEnd));
            }
        }
        v9PornItem.setInfo(info);
        v9PornItemList.add(v9PornItem);
    }

    return v9PornItemList;
}
 
Example 18
Source File: OutputFormatter.java    From JumpGo with Mozilla Public License 2.0 4 votes vote down vote up
/**
 * Tags every element matching {@code tagName} below {@code node} with a zero-based
 * {@code paragraphIndex} attribute, in the order returned by {@code select}.
 *
 * @param node    root of the search
 * @param tagName CSS query selecting the elements to number
 */
private static void setParagraphIndex(Element node, String tagName) {
    int nextIndex = 0;
    for (Element match : node.select(tagName)) {
        match.attr("paragraphIndex", String.valueOf(nextIndex));
        nextIndex++;
    }
}
 
Example 19
Source File: CssSelector.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Evaluates this selector's {@code selectorText} against the given element.
 *
 * @param element element to query
 * @return the matches returned by {@code select}
 */
@Override
public List<Element> selectElements(Element element) {
    List<Element> matches = element.select(selectorText);
    return matches;
}
 
Example 20
Source File: firstPost.java    From Gazetti_Newspaper_Reader with MIT License 3 votes vote down vote up
/**
 * Updates {@code mImageURL} from the second element matching the configured First Post image
 * selector and returns it.
 *
 * @param bodyElement element to search in
 * @return {@code mImageURL}; unchanged when fewer than two elements match
 */
private String getImageURL(Element bodyElement) {
    Elements mainImageElement = bodyElement.select(ConfigService.getInstance().getFirstPostImage());

    // Index 1 is read, so at least two matches are required; a single match used to throw
    // IndexOutOfBoundsException.
    if (mainImageElement.size() > 1) {
        mImageURL = mainImageElement.get(1).attr("src");
    }

    return mImageURL;

}