Java Code Examples for org.jsoup.nodes.Element#attr()

The following examples show how to use org.jsoup.nodes.Element#attr(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ImgurRipper.java    From ripme with MIT License 6 votes vote down vote up
/**
 * Rips all albums in an imgur user's account.
 *
 * <p>Every {@code div.cover a} link on the account page whose {@code href} contains
 * {@code imgur.com/a/} is treated as an album. Each album is ripped in turn, followed by a
 * pause of {@code SLEEP_BETWEEN_ALBUMS} seconds. A failure while ripping one album is logged
 * and the remaining albums are still attempted. If the thread is interrupted, the interrupt
 * flag is restored and the loop ends.
 *
 * @param url
 *      URL to imgur user account (http://username.imgur.com)
 * @throws IOException
 *      if an album URL is malformed, or propagated from fetching the account page
 */
private void ripUserAccount(URL url) throws IOException {
    LOGGER.info("Retrieving " + url);
    sendUpdate(STATUS.LOADING_RESOURCE, url.toExternalForm());
    Document doc = Http.url(url).get();
    for (Element album : doc.select("div.cover a")) {
        stopCheck();
        // attr() yields "" for a missing attribute, so a single lookup covers both the
        // "has href" and the "is an album link" checks.
        String href = album.attr("href");
        if (!href.contains("imgur.com/a/")) {
            continue;
        }
        String albumID = href.substring(href.lastIndexOf('/') + 1);
        // NOTE(review): prefixing "http:" assumes the hrefs are protocol-relative - confirm.
        URL albumURL = new URL("http:" + href + "/noscript");
        try {
            ripAlbum(albumURL, albumID);
            Thread.sleep(SLEEP_BETWEEN_ALBUMS * 1000);
        } catch (InterruptedException e) {
            // Do not swallow the interrupt: restore the flag and stop ripping further albums.
            Thread.currentThread().interrupt();
            LOGGER.error("Error while ripping album: " + e.getMessage(), e);
            break;
        } catch (Exception e) {
            LOGGER.error("Error while ripping album: " + e.getMessage(), e);
        }
    }
}
 
Example 2
Source File: SyncFragment.java    From SteamGifts with MIT License 6 votes vote down vote up
/**
 * Loads the SteamGifts sync page and extracts the values shown on it.
 *
 * @param params unused
 * @return a two-element array holding the {@code xsrf_token} input value and the text of
 *         the last-sync notification ({@code null} when that element is missing), or
 *         {@code null} when the token input is absent or any exception occurs
 */
@Override
protected String[] doInBackground(Void... params) {
    Log.d(TAG, "Fetching sync details");

    try {
        // Request the sync page with the current user's session cookie attached.
        Connection request = Jsoup.connect("https://www.steamgifts.com/account/profile/sync")
                .userAgent(Constants.JSOUP_USER_AGENT)
                .timeout(Constants.JSOUP_TIMEOUT)
                .cookie("PHPSESSID", SteamGiftsUserData.getCurrent(fragment.getContext()).getSessionId());
        Document page = request.get();

        // Let the shared user-data parser read the same page.
        SteamGiftsUserData.extract(fragment.getContext(), page);

        Element tokenInput = page.select("input[name=xsrf_token]").first();
        Element syncNotice = page.select(".form__sync-data .notification").first();
        if (tokenInput == null) {
            return null;
        }

        String tokenValue = tokenInput.attr("value");
        String lastSync = null;
        if (syncNotice != null) {
            lastSync = syncNotice.text();
        }
        return new String[]{tokenValue, lastSync};
    } catch (Exception e) {
        Log.e(TAG, "Error fetching URL", e);
        return null;
    }
}
 
Example 3
Source File: ArticleTextExtractor.java    From JumpGo with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Scores an element from its class name, id, {@code style} and {@code itemprop} attributes.
 *
 * <p>Matches against {@code POSITIVE} add to the score; matches against {@code UNLIKELY},
 * {@code NEGATIVE} and {@code NEGATIVE_STYLE} subtract from it.
 *
 * @param e element to score
 * @return the accumulated weight (may be negative)
 */
private int calcWeight(Element e) {
    final String cssClass = e.className();
    final String elementId = e.id();

    int score = 0;
    score += POSITIVE.matcher(cssClass).find() ? 35 : 0;
    score += POSITIVE.matcher(elementId).find() ? 45 : 0;
    score -= UNLIKELY.matcher(cssClass).find() ? 20 : 0;
    score -= UNLIKELY.matcher(elementId).find() ? 20 : 0;
    score -= NEGATIVE.matcher(cssClass).find() ? 50 : 0;
    score -= NEGATIVE.matcher(elementId).find() ? 50 : 0;

    final String inlineStyle = e.attr("style");
    boolean hasStyle = inlineStyle != null && !inlineStyle.isEmpty();
    if (hasStyle && NEGATIVE_STYLE.matcher(inlineStyle).find()) {
        score -= 50;
    }

    final String itemProp = e.attr("itemprop");
    boolean hasItemProp = itemProp != null && !itemProp.isEmpty();
    if (hasItemProp && POSITIVE.matcher(itemProp).find()) {
        score += 100;
    }

    return score;
}
 
Example 4
Source File: TextAttributeOfElementBuilderTest.java    From Asqatasun with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
     * Checks that buildTextFromElement returns null for a freshly constructed
     * TextAttributeOfElementBuilder (nothing configured on it), even though the
     * element carries the AttributeStore.ALT_ATTR attribute.
     */
    public void testBuildTextFromElementWithTargettedAttributeNotSet() {
        LOGGER.debug("buildTextFromElementWithTargettedAttributeNotSet");

        Element div = new Element(Tag.valueOf("div"), "");
        div.attr(AttributeStore.ALT_ATTR, "test");

        TextAttributeOfElementBuilder builder = new TextAttributeOfElementBuilder();
        assertNull(builder.buildTextFromElement(div));
    }
 
Example 5
Source File: HtmlView.java    From JavaRushTasks with MIT License 5 votes vote down vote up
/**
 * Rebuilds the vacancies table of the document returned by getDocument().
 *
 * <p>The element with class "template" is cloned and stripped of its style attribute and
 * its "template" class. All rows whose class attribute is exactly "vacancy" are removed,
 * and one filled-in copy of the clone per vacancy is inserted before the template element.
 *
 * @param vacancies vacancies to render, in output order
 * @return the HTML of the updated document, or the literal "Some exception occurred" when
 *         getDocument() throws an IOException
 */
private String getUpdatedFileContent(List<Vacancy> vacancies) {

        Document document = null;
        try {
            document = getDocument();

            Element templateOriginal = document.getElementsByClass("template").first();
            Element copyTemplate = templateOriginal.clone();
            copyTemplate.removeAttr("style");
            copyTemplate.removeClass("template");
            // Exact-value match: rows with class "vacancy template" are not selected, so the
            // template row survives. The former trailing .not(...) call was dropped: its
            // result was discarded and its selector was missing the closing bracket.
            document.select("tr[class=vacancy]").remove();

            for (Vacancy vacancy : vacancies) {
                Element localClone = copyTemplate.clone();
                localClone.getElementsByClass("city").first().text(vacancy.getCity());
                localClone.getElementsByClass("companyName").first().text(vacancy.getCompanyName());
                localClone.getElementsByClass("salary").first().text(vacancy.getSalary());
                Element link = localClone.getElementsByTag("a").first();
                link.text(vacancy.getTitle());
                link.attr("href", vacancy.getUrl());

                // Insert before the template so rows keep the order of the input list.
                templateOriginal.before(localClone.outerHtml());
            }
        } catch (IOException e) {
            e.printStackTrace();
            return "Some exception occurred";
        }
        return document.html();
    }
 
Example 6
Source File: IfanrHotProcessor.java    From hot-crawler with MIT License 5 votes vote down vote up
/**
 * Builds an Info from the first descendant carrying the "js-title-transform" class:
 * its inner HTML becomes the title and its href attribute the URL.
 *
 * @param element element containing the title node
 * @return the Info built from that node
 */
@Override
protected Info getInfoByElement(Element element) {
    Element titleNode = element.getElementsByClass("js-title-transform").get(0);
    String link = titleNode.attr("href");
    String title = titleNode.html();
    return new Info(title, link);
}
 
Example 7
Source File: DefaultYoutubeTrackDetails.java    From lavaplayer with Apache License 2.0 5 votes vote down vote up
/**
 * Collects one track format per Representation element found under each AdaptationSet
 * of the given document.
 *
 * <p>The content type is the AdaptationSet's mimeType combined with the Representation's
 * codecs. The content length is read from the "/clen/" segment of the BaseURL text.
 * Representations without that segment are skipped.
 *
 * @param document parsed document to read
 * @return the formats found, in document order
 */
private List<YoutubeTrackFormat> loadTrackFormatsFromDashDocument(Document document) {
  List<YoutubeTrackFormat> formats = new ArrayList<>();

  for (Element adaptationSet : document.select("AdaptationSet")) {
    String setMimeType = adaptationSet.attr("mimeType");

    for (Element rep : adaptationSet.select("Representation")) {
      String baseUrl = rep.select("BaseURL").first().text();
      String lengthText = DataFormatTools.extractBetween(baseUrl, "/clen/", "/");
      String fullType = setMimeType + "; codecs=" + rep.attr("codecs");

      if (lengthText != null) {
        ContentType parsedType = ContentType.parse(fullType);
        long bitrate = Long.parseLong(rep.attr("bandwidth"));
        long length = Long.parseLong(lengthText);

        formats.add(new YoutubeTrackFormat(
            parsedType, bitrate, length, baseUrl, null, DEFAULT_SIGNATURE_KEY));
      } else {
        log.debug("Skipping format {} because the content length is missing", fullType);
      }
    }
  }

  return formats;
}
 
Example 8
Source File: SankakuComplexRipper.java    From ripme with MIT License 5 votes vote down vote up
/**
 * Fetches the page following {@code doc}, using the {@code next-page-url} attribute of the
 * first {@code div.pagination} element.
 *
 * @param doc the page currently being processed
 * @return the next page
 * @throws IOException with message "No more pages" when there is no pagination element, it
 *         has no next-page URL, or the next page is page 26; also propagated from the
 *         request itself
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    Element pagination = doc.select("div.pagination").first();
    // first() returns null when nothing matched; treat that as "no more pages" instead of
    // failing with a NullPointerException.
    if (pagination != null && pagination.hasAttr("next-page-url")) {
        String nextPage = pagination.attr("abs:next-page-url");
        // Only logged in users can see past page 25
        // Trying to rip page 26 will throw a no images found error
        if (!nextPage.contains("page=26")) {
            LOGGER.info("Getting next page: " + nextPage);
            return Http.url(nextPage).cookies(cookies).get();
        }
    }
    throw new IOException("No more pages");
}
 
Example 9
Source File: OnnmyoujiSpider.java    From SpringBootUnity with MIT License 5 votes vote down vote up
/**
 * Gets the links to the mitama detail pages.
 *
 * <p>Downloads {@code URL}, takes the first element matching {@code .heroList-1} and
 * returns the {@code href} of every anchor inside it.
 *
 * @return the href values, in document order
 */
private static List<String> getMitamaDetailInfoUrl() {
    Document page = Jsoup.parse(HttpUtil.get(URL));
    Elements anchors = page.select(".heroList-1").get(0).select("a");

    List<String> detailUrls = new ArrayList<>();
    for (int i = 0; i < anchors.size(); i++) {
        detailUrls.add(anchors.get(i).attr("href"));
    }
    return detailUrls;
}
 
Example 10
Source File: HiParser.java    From hipda with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Turns one thread-notification element into a list item.
 *
 * <p>Anchors pointing at "space.php" contribute their text to the info line. The first
 * anchor pointing at "redirect.php?" supplies the title, thread id and post id and ends
 * the scan. The time comes from the first {@code em} element.
 *
 * @param root element holding the notification
 * @return the populated item, or {@code null} when the element has no {@code em} child
 */
private static SimpleListItemBean parseNotifyThread(Element root) {
    SimpleListItemBean bean = new SimpleListItemBean();
    StringBuilder summary = new StringBuilder();

    for (Element anchor : root.select("a")) {
        String target = anchor.attr("href");
        if (target.contains("space.php")) {
            // user link: collect the names of the users who replied
            summary.append(anchor.text()).append(" ");
            continue;
        }
        if (target.contains("redirect.php?")) {
            // thread link: title plus thread id and post id taken from the query string
            bean.setTitle(anchor.text());
            bean.setTid(Utils.getMiddleString(target, "ptid=", "&"));
            bean.setPid(Utils.getMiddleString(target, "pid=", "&"));
            break;
        }
    }

    // timestamp
    Elements timeNodes = root.select("em");
    if (timeNodes.isEmpty()) {
        return null;
    }
    bean.setTime(timeNodes.first().text());

    // Append the phrase describing which kind of reply notification this is.
    boolean followedThread = root.text().contains("回复了您关注的主题");
    summary.append(followedThread ? "回复了您关注的主题" : "回复了您的帖子 ");

    bean.setNew(true);
    bean.setInfo(summary.toString());
    return bean;
}
 
Example 11
Source File: FUN_CSSPath.java    From sparql-generate with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the value of an attribute on the first element matched by a CSS selector.
 *
 * @param element       element to search within
 * @param selectPath    CSS selector to evaluate
 * @param attributeName attribute to read from the first match
 * @return the attribute value wrapped in a NodeValueString
 * @throws ExprEvalException when nothing matches, or the first match lacks the attribute
 */
private NodeValue selectAttribute(Element element, String selectPath, String attributeName) {
    Element match = element.select(selectPath).first();
    if (match != null && match.hasAttr(attributeName)) {
        return new NodeValueString(match.attr(attributeName));
    }
    if (match == null) {
        throw new ExprEvalException("No evaluation of " + element + ", " + selectPath);
    }
    throw new ExprEvalException("The evaluation of " + element + ", " + selectPath + " is an element that does not have attribute " + attributeName);
}
 
Example 12
Source File: CommonParser.java    From movienow with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Extracts text from an element according to a rule of the form
 * {@code selector[!remove1[!remove2...]]}.
 *
 * <p>The selector part is "Text" (element text), a name containing "Attr" (attribute named
 * by the rest of the string), or anything else. With removal parts present, "anything else"
 * is a CSS selector whose first match is stringified; without them it is read as an
 * attribute name. The result goes through StringUtil.replaceBlank, then every removal part
 * is deleted from it.
 *
 * @param element  element to read from
 * @param lastRule rule string as described above
 * @return the extracted and cleaned text
 */
private static String getTextWithoutOr(Element element, String lastRule) {
        String[] parts = lastRule.split("!");
        boolean hasRemovals = parts.length > 1;
        // Without removal parts the whole rule string is the selector, not parts[0].
        String selector = hasRemovals ? parts[0] : lastRule;

        String raw;
        if (selector.equals("Text")) {
            raw = element.text();
        } else if (selector.contains("Attr")) {
            raw = element.attr(selector.replace("Attr", ""));
        } else if (hasRemovals) {
            raw = element.select(selector).first().toString();
        } else {
            raw = element.attr(selector);
        }

        String cleaned = StringUtil.replaceBlank(raw);
        // No-op unless removal parts follow the selector.
        for (int idx = 1; idx < parts.length; idx++) {
            cleaned = cleaned.replace(parts[idx], "");
        }
        return cleaned;
    }
 
Example 13
Source File: SteamGiftsUserData.java    From SteamGifts with MIT License 5 votes vote down vote up
/**
 * Updates the current user data from the navigation bar of a loaded page.
 *
 * <p>Does nothing when there is no current user data or no document. When the first link
 * of the last nav button container points at "/user/...", the name, avatar, points, level
 * and notification counts are refreshed. When it points at "/?login" while the current
 * data says logged in, the data is replaced by a fresh instance and saved if a context is
 * available.
 *
 * @param context  context used to look up and save the user data; may be null
 * @param document page to read; may be null
 */
public static void extract(@Nullable Context context, @Nullable Document document) {
    if (getCurrent(context) == null || document == null) {
        return;
    }

    Elements navButtons = document.select(".nav__button-container");
    Element profileLink = navButtons.last().select("a").first();
    String target = profileLink.attr("href");

    if (!target.startsWith("/user/")) {
        // A login link while we think we are logged in: reset the stored user.
        if (target.startsWith("/?login") && current.isLoggedIn()) {
            current = new SteamGiftsUserData();
            if (context != null) {
                current.save(context);
            }
        }
        return;
    }

    // "/user/" is six characters long; the remainder is the user name.
    current.setName(target.substring(6));

    // avatar, taken from the inline style of the first div inside the profile link
    String avatarStyle = profileLink.select("div").first().attr("style");
    current.setImageUrl(Utils.extractAvatar(avatarStyle));

    // points
    Element accountLink = navButtons.select("a[href=/account]").first();
    String pointsText = accountLink.select(".nav__points").text();
    current.setPoints(Utils.parseInt(pointsText));

    // level, stored as the truncated integer part of the title value
    String levelTitle = accountLink.select("span").last().attr("title");
    current.setLevel((int) Float.parseFloat(levelTitle));

    // notification counters
    Elements notices = navButtons.select(".nav__button-container--notification");
    String createdText = notices.select("a[href=/giveaways/created]").first().text();
    current.setCreatedNotification(getInt(createdText));
    String wonText = notices.select("a[href=/giveaways/won]").first().text();
    current.setWonNotification(getInt(wonText));
    String messagesText = notices.select("a[href=/messages]").first().text();
    current.setMessageNotification(getInt(messagesText));
}
 
Example 14
Source File: ImagearnRipper.java    From ripme with MIT License 5 votes vote down vote up
/**
 * Resolves the image URLs for every thumbnail in the gallery.
 *
 * <p>Each {@code div#gallery > div > a} link is followed and the {@code href} of the first
 * {@code a.thickbox} element on the linked page is collected. Pages that cannot be
 * downloaded, or that contain no such link, are logged and skipped.
 *
 * @param doc gallery page
 * @return the collected image URLs, in gallery order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> imageURLs = new ArrayList<>();
    for (Element thumb : doc.select("div#gallery > div > a")) {
        String imageURL = thumb.attr("href");
        try {
            Document imagedoc = new Http("http://imagearn.com/" + imageURL).get();
            // first() is null when the page has no matching link; skip that thumbnail
            // rather than aborting the whole page with a NullPointerException.
            Element fullImage = imagedoc.select("a.thickbox").first();
            if (fullImage == null) {
                LOGGER.warn("No a.thickbox link found on page: " + imageURL);
                continue;
            }
            imageURLs.add(fullImage.attr("href"));
        } catch (IOException e) {
            // Keep the cause in the log so download failures can be diagnosed.
            LOGGER.warn("Was unable to download page: " + imageURL, e);
        }
    }
    return imageURLs;
}
 
Example 15
Source File: ResourceQuote.java    From templatespider with Apache License 2.0 5 votes vote down vote up
/**
 * Replaces the {@code src} of img tags.
 *
 * <p>Each {@code src} value is passed through {@code hierarchyReplace} together with
 * {@code this.baseUri}; the attribute is rewritten only when the result differs.
 *
 * @param doc document whose img tags are updated in place
 * @return the same document instance
 */
public Document imgTag(Document doc){
	for (Element img : doc.getElementsByTag("img")) {
		String currentSrc = img.attr("src");
		String replacedSrc = hierarchyReplace(this.baseUri, currentSrc);
		if (currentSrc.equals(replacedSrc)) {
			continue;
		}
		img.attr("src", replacedSrc);
	}
	return doc;
}
 
Example 16
Source File: ParseSection.java    From schedge with MIT License 5 votes vote down vote up
/**
 * Parses raw catalog section HTML into a SectionAttribute.
 *
 * <p>The link passed on is the href of the last anchor containing "mapBuilding", or
 * {@code null} when there is none. All {@code a}, {@code i} and {@code b} tags are
 * unwrapped before the course name and the section-content divs are read.
 *
 * @param rawData raw HTML for one section
 * @return the parsed section, or {@code null} when the input is blank or the page contains
 *         a {@code div.alert.alert-info} element
 */
public static SectionAttribute parse(@NotNull String rawData) {
  logger.debug("parsing raw catalog section data into SectionAttribute...");

  String html = rawData.trim();
  if (html.isEmpty()) {
    logger.warn("Got bad data: empty string");
    return null; // the course doesn't exist
  }

  Document doc = Jsoup.parse(html);
  Element alert = doc.selectFirst("div.alert.alert-info");
  if (alert != null) {
    logger.warn("Got bad data: " + alert.text());
    return null; // the course doesn't exist
  }

  // Remember the last building-map link before the anchors are unwrapped below.
  String link = null;
  for (Element anchor : doc.select("a")) {
    String href = anchor.attr("href");
    if (href.contains("mapBuilding")) {
      link = href;
    }
  }

  // Strip inline markup so only the text of these tags remains.
  for (String tag : new String[] {"a", "i", "b"}) {
    doc.select(tag).unwrap();
  }

  Element mainSection = doc.selectFirst("body > section.main");
  Element contentSection = mainSection.selectFirst("> section");
  String courseName = contentSection.selectFirst("> div.primary-head").text();
  Map<String, String> secData =
      parseSectionAttributes(contentSection.select("> div.section-content.clearfix"));

  return parsingElements(secData, courseName, link);
}
 
Example 17
Source File: JSoupBaiduSearcher.java    From search with Apache License 2.0 4 votes vote down vote up
/**
 * Runs a Baidu web search for one result page and scrapes the result entries.
 *
 * <p>For each expected entry the title link and summary are located by CSS selector, with
 * fallback selectors when the primary ones match nothing. Entries with a non-blank title
 * and summary become a Webpage whose content is loaded through Tools.getHTMLContent.
 *
 * @param keyword search keyword, concatenated into the query URL as given
 * @param page    1-based page number
 * @return the search result with page, total and collected web pages; {@code null} when
 *         the reported total is below 1. If an IOException occurs it is logged and the
 *         result is returned with whatever pages were collected up to that point.
 */
@Override
public SearchResult search(String keyword, int page) {
    int pageSize = 10;
    // Baidu returns 10 results per page; the pn parameter is not the page number but the
    // offset of the first returned result.
    // E.g. pn=0 for the first page, pn=10 for the second, pn=20 for the third, and so on;
    // in general: (page-1)*pageSize
    // NOTE(review): keyword is not URL-encoded here - confirm whether callers pre-encode it.
    String url = "http://www.baidu.com/s?pn="+(page-1)*pageSize+"&wd="+keyword;
    
    SearchResult searchResult = new SearchResult();
    searchResult.setPage(page);
    List<Webpage> webpages = new ArrayList<>();
    try {
        Document document = Jsoup.connect(url).get();
        
        // Get the number of search results
        int total = getBaiduSearchResultCount(document);
        searchResult.setTotal(total);
        int len = 10;
        if (total < 1) {
            return null;
        }
        // Fewer results than one full page were found
        if (total < 10) {
            len = total;
        }
        for (int i = 0; i < len; i++) {
            // Result containers are addressed by a numeric div id that continues across pages.
            String titleCssQuery = "html body div div div div#content_left div#" + (i + 1 + (page-1)*pageSize) + ".result.c-container h3.t a";
            String summaryCssQuery = "html body div div div div#content_left div#" + (i + 1 + (page-1)*pageSize) + ".result.c-container div.c-abstract";
            LOG.debug("titleCssQuery:" + titleCssQuery);
            LOG.debug("summaryCssQuery:" + summaryCssQuery);
            Element titleElement = document.select(titleCssQuery).first();
            String href = "";
            String titleText = "";
            if(titleElement != null){
                titleText = titleElement.text();
                href = titleElement.attr("href");
            }else{
                // Handle Baidu Baike (encyclopedia) entries
                // Both queries are replaced, so the summary lookup below uses the fallback too.
                titleCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op h3.t a";
                summaryCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op div p";
                LOG.debug("处理百度百科 titleCssQuery:" + titleCssQuery);
                LOG.debug("处理百度百科 summaryCssQuery:" + summaryCssQuery);
                titleElement = document.select(titleCssQuery).first();
                if(titleElement != null){
                    titleText = titleElement.text();
                    href = titleElement.attr("href");
                }
            }
            LOG.debug(titleText);
            Element summaryElement = document.select(summaryCssQuery).first();
            // Handle Baidu Zhidao (Q&A) entries
            if(summaryElement == null){
                // Retry with "font" in place of "div.c-abstract"; the replace is a no-op when
                // the Baike fallback query is in effect.
                summaryCssQuery = summaryCssQuery.replace("div.c-abstract","font");
                LOG.debug("处理百度知道 summaryCssQuery:" + summaryCssQuery);
                summaryElement = document.select(summaryCssQuery).first();
            }
            String summaryText = "";
            if(summaryElement != null){
                summaryText = summaryElement.text(); 
            }
            LOG.debug(summaryText);                
            
            // Keep the entry only when both title and summary are non-blank.
            if (titleText != null && !"".equals(titleText.trim()) && summaryText != null && !"".equals(summaryText.trim())) {
                Webpage webpage = new Webpage();
                webpage.setTitle(titleText);
                webpage.setUrl(href);
                webpage.setSummary(summaryText);
                // NOTE(review): href starts as "" and is only reassigned from attr(), so this
                // null check looks always true - confirm whether an empty check was intended.
                if (href != null) {
                    String content = Tools.getHTMLContent(href);
                    webpage.setContent(content);
                } else {
                    LOG.info("页面正确提取失败");
                }
                webpages.add(webpage);
            } else {
                LOG.error("获取搜索结果列表项出错:" + titleText + " - " + summaryText);
            }
        }
        
        
    } catch (IOException ex) {
        LOG.error("搜索出错",ex);
    }
    searchResult.setWebpages(webpages);;
    return searchResult;
}
 
Example 18
Source File: GetYAnswersPropertiesFromQid.java    From LiveQAServerDemo with MIT License 4 votes vote down vote up
/**
 * Returns the value of the element's {@code content} attribute.
 *
 * @param e element to read
 * @return the attribute value
 */
@Override
public String getText(Element e) {
    final String content = e.attr("content");
    return content;
}
 
Example 19
Source File: GetReviewerInfo.java    From customer-review-crawler with The Unlicense 4 votes vote down vote up
/**
 * Scrapes profile information for one Amazon reviewer.
 *
 * <p>Reads the reviewer's profile page for ranking, helpful votes, review count and
 * location, then the member-reviews page for the star ratings found in image alt texts.
 * At most the first 10 ratings are kept. When 10 or fewer ratings were found, the review
 * count is replaced by the number of ratings.
 *
 * @param reviewerID reviewer id appended to the profile and member-reviews URLs
 * @return six strings in this order: reviewer id, total reviews, reviewer ranking, total
 *         helpful votes, location, and the recent ratings joined by spaces; {@code null}
 *         when either page request fails with an IOException
 */
public ArrayList<String> reviewer_info(String reviewerID) {
	System.out.println("Reviewer: " + reviewerID);
	String url = "http://www.amazon.com/gp/pdp/profile/" + reviewerID;
	String url2 = "http://www.amazon.com/gp/cdp/member-reviews/"
			+ reviewerID + "/?sort_by=MostRecentReview";
	Document doc = null;
	ArrayList<String> attributes = new ArrayList<String>();
	String Reviewer_ranking = "";
	String Total_helpful_votes = "";
	String Total_reviews = "1";
	String Location = "";
	List<String> Recent_rating = new ArrayList<>();
	try {
		doc = Jsoup.connect(url).header("User-Agent",
				"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2").get();

		// reviewer ranking
		// NOTE(review): the selector text says "Ranking" while the regex says "ranking";
		// the regex is case-sensitive - confirm which spelling the page uses.
		Elements Reviewer_ranking_e = doc.select("span.a-size-small:contains(Reviewer Ranking: #)");
		System.out.println(Reviewer_ranking_e);
		Pattern pattern = Pattern.compile("(Reviewer ranking: #)(\\S+)");
		Matcher matcher = pattern.matcher(Reviewer_ranking_e.text());
		if (matcher.find()) {
			Reviewer_ranking = matcher.group(2);
		}

		// review helpful votes, expected as "(X of Y)" next to the votes label
		Element total_vote = doc.select("span.a-size-small:contains(votes received on reviews)").first();
		if (total_vote != null) {
			Element vote_parent = total_vote.parent();
			String votes_string = vote_parent.select("span:contains( of )").text();
			pattern = Pattern
					.compile("([(])(\\S+)( of )(\\S+)([)])");
			matcher = pattern.matcher(votes_string);
			if (matcher.find()) {
				Total_helpful_votes = matcher.group(2) + " of "
						+ matcher.group(4);
			}
		}

		// total number of reviews
		Element Total_reviews_e = doc.select("div.reviews-link").first();
		if (Total_reviews_e != null) {
			pattern = Pattern.compile("(Reviews [(])((\\S+))([)])");
			matcher = pattern.matcher(Total_reviews_e.text());
			if (matcher.find()) {
				Total_reviews = matcher.group(2);
			}
		}

		// location of the reviewer (if listed)
		// first() is null when the container is absent, so check the element itself
		// before asking for its parent, as the other lookups above do.
		Element Location_e = doc.select("div.profile-name-container").first();
		if (Location_e != null && Location_e.parent() != null) {
			Location = Location_e.parent().text();
		}

		// recent 10 ratings: the first character of alt texts containing "out of 5 stars"
		doc = Jsoup.connect(url2).get();
		Elements images = doc.select("img");
		for (Element image : images) {
			String imagealt = image.attr("alt");
			if (imagealt.contains("out of 5 stars")) {
				Recent_rating.add(imagealt.substring(0, 1));
			}
		}

	} catch (IOException e) {
		System.out.println(e);
		System.out.println(reviewerID + " Removed");
		return (null);
	}

	if (Recent_rating.size() > 10) {
		Recent_rating = Recent_rating.subList(0, 10);
	} else {
		// 10 or fewer ratings found: use that count as the review total.
		Total_reviews = Integer.toString(Recent_rating.size());
	}
	String Recent_rating_joined = org.apache.commons.lang.StringUtils.join(
			Recent_rating, " ");
	attributes.addAll(Arrays.asList(reviewerID, Total_reviews,
			Reviewer_ranking, Total_helpful_votes, Location,
			Recent_rating_joined));
	return (attributes);
}
 
Example 20
Source File: LotusNoirDecks.java    From MtgDesktopCompanion with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Lists the decks published on pages 1 to MAX_PAGE of the configured deck site.
 *
 * <p>For every {@code div.thumb_page} entry the deck name, URL and author are read from the
 * listing. The deck page is then loaded, and the second word of each
 * {@code span.card_title_us} text is mapped to a colour symbol when it is a basic land name.
 *
 * @return the decks found, in listing order
 * @throws IOException propagated from loading a listing or deck page
 */
@Override
public List<RetrievableDeck> getDeckList() throws IOException {

	String decksUrl = getString(URL) + "?dpage=" + getString(MAX_PAGE) + "&action=" + getString(FORMAT);

	logger.debug("snif decks : " + decksUrl);

	int lastPage = getInt(MAX_PAGE);
	List<RetrievableDeck> decks = new ArrayList<>();

	for (int page = 1; page <= lastPage; page++) {
		Document listing = URLTools.extractHtml(getString(URL) + "?dpage=" + page + "&action=" + getString(FORMAT));

		for (Element thumb : listing.select("div.thumb_page")) {
			RetrievableDeck deck = new RetrievableDeck();
			Element link = thumb.select("a").get(0);

			String title = link.attr("title").replace("Lien vers ", "").trim();
			String deckUrl = link.attr("href");
			String author = thumb.select("small").select("a").text();

			// Derive the colour string from the basic lands listed on the deck page.
			Elements cardTitles = URLTools.extractHtml(deckUrl).select("span.card_title_us");
			StringBuilder colors = new StringBuilder();
			for (Element cardTitle : cardTitles) {
				String land = cardTitle.text().split(" ")[1];
				if ("Plain".equals(land) || "Plains".equals(land)) {
					colors.append("{W}");
				} else if ("Island".equals(land) || "Islands".equals(land)) {
					colors.append("{U}");
				} else if ("Swamp".equals(land) || "Swamps".equals(land)) {
					colors.append("{B}");
				} else if ("Mountain".equals(land) || "Mountains".equals(land)) {
					colors.append("{R}");
				} else if ("Forest".equals(land) || "Forests".equals(land)) {
					colors.append("{G}");
				}
			}

			deck.setName(title);
			try {
				deck.setUrl(new URI(deckUrl));
			} catch (URISyntaxException invalidUrl) {
				// An unparseable href leaves the deck without a URL.
				deck.setUrl(null);
			}
			deck.setAuthor(author);
			deck.setColor(colors.toString());

			decks.add(deck);
		}
	}
	return decks;
}