Java Code Examples for org.jsoup.nodes.Element#absUrl()

The following examples show how to use org.jsoup.nodes.Element#absUrl(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: PUBGNewsFetch.java    From Companion-For-PUBG-Android with MIT License 6 votes vote down vote up
/**
 * Fetches the page at {@code PUBG_NEWS_LINK} and publishes one {@link NewsItem} for every
 * element matched by {@code NEWS_DOCUMENT_SELECTOR}.
 *
 * <p>A row without a link or image element is still published; the corresponding URL is the
 * empty string in that case. If the page cannot be fetched, the stack trace is printed and
 * nothing more is published.
 *
 * @param voids unused
 * @return always {@code null}
 */
@Override
protected Void doInBackground(final Void... voids) {
    try {
        final Document document = Jsoup.connect(PUBG_NEWS_LINK).get();
        for (final Element row : document.select(NEWS_DOCUMENT_SELECTOR)) {
            final String title = row.select(NEWS_TITLE_SELECTOR).text();
            final String type = row.select(NEWS_TYPE_SELECTOR).text();
            final String date = row.select(NEWS_DATE_SELECTOR).text();
            final String description = row.select(NEWS_DESCRIPTION_SELECTOR).text();

            // first() yields null when the selector matches nothing. Fall back to "", which is
            // also what absUrl() returns for an attribute it cannot resolve, instead of
            // letting a NullPointerException escape the background task.
            final Element link = row.select(NEWS_LINK_SELECTOR).first();
            final String linkSrc = link != null ? link.absUrl(NEWS_LINKSRC_SELECTOR) : "";
            final Element img = row.select(NEWS_IMG_SELECTOR).first();
            final String imgSrc = img != null ? img.absUrl(NEWS_IMGSRC_SELECTOR) : "";

            publishProgress(new NewsItem(title, type, date, description, linkSrc, imgSrc));
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
 
Example 2
Source File: Whitelist.java    From jsoup-learning with MIT License 6 votes vote down vote up
/**
 * Tests whether the URL stored in {@code attr} starts with one of the given protocols.
 *
 * <p>The attribute is first resolved to an absolute URL through {@code el}; if that yields an
 * empty string the raw attribute value is used instead. Unless {@code preserveRelativeLinks} is
 * set, the attribute is overwritten with the value that was tested.
 *
 * @param el element that owns the attribute and supplies the base URI
 * @param attr attribute holding the URL
 * @param protocols protocols to accept
 * @return {@code true} if the value starts with {@code protocol + ":"} for any given protocol
 */
private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) {
    // try to resolve relative urls to abs, and optionally update the attribute so output html has abs.
    // rels without a baseuri get removed
    String value = el.absUrl(attr.getKey());
    if (value.length() == 0) {
        value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown protocols
    }
    if (!preserveRelativeLinks) {
        attr.setValue(value);
    }

    // Lower-case once with a fixed locale: the no-arg toLowerCase() follows the default locale,
    // so under e.g. a Turkish locale "FILE:" would not become "file:" and the match would fail.
    final String lowerValue = value.toLowerCase(java.util.Locale.ENGLISH);
    for (Protocol protocol : protocols) {
        if (lowerValue.startsWith(protocol.toString() + ":")) {
            return true;
        }
    }
    return false;
}
 
Example 3
Source File: AbstractDownloadableLinkRuleImplementation.java    From Asqatasun with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Runs the parent selection, then post-processes the selected elements by the URI of their
 * {@code HREF_ATTR} attribute: an element whose URI passes {@code isLinkWithProperExtension}
 * is removed from the selection when the URI has a non-blank fragment, and is otherwise added
 * to {@code linkWithSimpleExtension}.
 *
 * @param sspHandler handler forwarded to the parent implementation
 */
@Override
protected void select(SSPHandler sspHandler) {
    super.select(sspHandler);
    for (Iterator<Element> it = getElements().get().iterator(); it.hasNext();) {
        Element candidate = it.next();
        try {
            URI target = new URI(candidate.absUrl(HREF_ATTR));
            if (!isLinkWithProperExtension(target)) {
                continue;
            }
            if (StringUtils.isNotBlank(target.getFragment())) {
                it.remove();
            } else {
                linkWithSimpleExtension.add(candidate);
            }
        } catch (Exception ignored) {
            // Best effort: on any failure (e.g. an href that is not a valid URI) the element
            // is left in the selection and is not recorded.
        }
    }
}
 
Example 4
Source File: HtmlParse.java    From ChipHellClient with Apache License 2.0 6 votes vote down vote up
/**
 * Parses an album page.
 *
 * <p>Collects the absolute {@code orig} URL of every element with class {@code postalbum_i}
 * and reads the current position from the text of the element with id {@code curpic},
 * minus one.
 *
 * @param responseBody markup of the album page
 * @return the album URLs together with the current position
 */
public static AlbumWrap parseAubum(String responseBody) {
    AlbumWrap result = new AlbumWrap();

    Document page = Jsoup.parse(responseBody);
    page.setBaseUri(Constants.BASE_URL);

    List<String> imageUrls = new ArrayList<String>();
    for (Element item : page.getElementsByClass("postalbum_i")) {
        imageUrls.add(item.absUrl("orig"));
    }
    result.setUrls(imageUrls);

    // The page reports the current picture as a number; the stored position is that minus one.
    String currentText = page.getElementById("curpic").text();
    result.setCurPosition(Integer.parseInt(currentText) - 1);
    return result;
}
 
Example 5
Source File: LoadMessagesTask.java    From SteamGifts with MIT License 6 votes vote down vote up
/**
 * Builds a flat list of message headers, each followed by its comments, from the
 * {@code .comments__entity} elements of the document.
 *
 * <p>Entities without a {@code .comments__entity__name a} link are skipped.
 *
 * @param document page to read from
 * @return headers and their comments, in document order
 */
private List<IEndlessAdaptable> loadMessages(Document document) {
    List<IEndlessAdaptable> items = new ArrayList<>();
    for (Element entity : document.select(".comments__entity")) {
        Element nameLink = entity.select(".comments__entity__name a").first();
        if (nameLink == null) {
            continue;
        }

        MessageHeader header = new MessageHeader(nameLink.text(), nameLink.absUrl("href"));

        // The comments, when present, are read from the entity's next sibling element.
        Element commentsRoot = entity.nextElementSibling();
        if (commentsRoot != null) {
            Utils.loadComments(commentsRoot, header, Comment.Type.COMMENT);
        }

        // Header first, then everything loaded into it.
        items.add(header);
        items.addAll(header.getComments());
    }
    return items;
}
 
Example 6
Source File: Whitelist.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Tests whether the URL stored in {@code attr} is acceptable under one of the given protocols.
 *
 * <p>The attribute is resolved to an absolute URL through {@code el}, falling back to the raw
 * value when resolution yields an empty string. Unless {@code preserveRelativeLinks} is set,
 * the attribute is overwritten with the value that was tested. A protocol of {@code "#"} is
 * checked with {@code isValidAnchor}; any other protocol matches when the lower-cased value
 * starts with {@code protocol + ":"}.
 *
 * @param el element that owns the attribute
 * @param attr attribute holding the URL
 * @param protocols protocols to accept
 * @return {@code true} on the first protocol that accepts the value, otherwise {@code false}
 */
private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) {
    // Prefer the absolute form; if there is none, test the raw value so that custom,
    // unknown protocols still get a chance.
    String resolved = el.absUrl(attr.getKey());
    String value = resolved.length() == 0 ? attr.getValue() : resolved;
    if (!preserveRelativeLinks) {
        attr.setValue(value);
    }

    for (Protocol allowed : protocols) {
        String scheme = allowed.toString();

        if (scheme.equals("#")) { // anchor links
            if (isValidAnchor(value)) {
                return true;
            }
            continue;
        }

        if (lowerCase(value).startsWith(scheme + ":")) {
            return true;
        }
    }
    return false;
}
 
Example 7
Source File: SishuokWhitelist.java    From es with Apache License 2.0 5 votes vote down vote up
/**
 * Tests whether the URL stored in {@code attr} starts with one of the given protocols.
 *
 * <p>The attribute is always overwritten with its absolute form as resolved through
 * {@code el} before the test.
 *
 * @param el element that owns the attribute and supplies the base URI
 * @param attr attribute holding the URL; its value is replaced by the resolved URL
 * @param protocols protocols to accept
 * @return {@code true} if the resolved value starts with {@code protocol + ":"} for any
 *     given protocol
 */
private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) {
    // resolve relative urls to abs, and update the attribute so output html has abs.
    // rels without a baseuri get removed
    String value = el.absUrl(attr.getKey());
    attr.setValue(value);

    // Lower-case once with a fixed locale: the no-arg toLowerCase() follows the default locale,
    // so under e.g. a Turkish locale "FILE:" would not become "file:" and the match would fail.
    final String lowerValue = value.toLowerCase(java.util.Locale.ENGLISH);
    for (Protocol protocol : protocols) {
        if (lowerValue.startsWith(protocol.toString() + ":")) {
            return true;
        }
    }
    return false;
}
 
Example 8
Source File: HtmlParse.java    From ChipHellClient with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the data needed to prepare a quote reply.
 *
 * <p>Everything is read from the element with id {@code postform}. If anything goes wrong
 * while reading, the stack trace is printed and the returned object is left unpopulated.
 *
 * @param responseBody markup of the reply page
 * @return the populated quote-reply data, or an unpopulated instance on failure
 */
public static PrepareQuoteReply parsePrepareQuoteReply(String responseBody) {
    PrepareQuoteReply result = new PrepareQuoteReply();
    try {
        Document page = Jsoup.parse(responseBody);
        page.setBaseUri(Constants.BASE_URL);

        Element form = page.getElementById("postform");

        // Read every value first so that a failure leaves the result untouched.
        String actionUrl = form.absUrl("action");
        String formHash = valueOfNamedField(form, "formhash");
        String postTime = valueOfNamedField(form, "posttime");
        String noticeAuthor = valueOfNamedField(form, "noticeauthor");
        String noticeTrimStr = valueOfNamedField(form, "noticetrimstr");
        String noticeAuthorMsg = valueOfNamedField(form, "noticeauthormsg");
        String repPid = valueOfNamedField(form, "reppid");
        String repPost = valueOfNamedField(form, "reppost");
        String quoteHtml = form.getElementsByTag("blockquote").first().toString();

        result.setNoticeauthor(noticeAuthor);
        result.setNoticeauthormsg(noticeAuthorMsg);
        result.setNoticetrimstr(noticeTrimStr);
        result.setPosttime(postTime);
        result.setQuoteBody(quoteHtml);
        result.setReppid(repPid);
        result.setUrl(actionUrl);
        result.setFormhash(formHash);
        result.setReppost(repPost);
    } catch (Exception e) {
        e.printStackTrace();
    }

    return result;
}

/** Returns the {@code value} attribute of the first element under {@code form} whose {@code name} attribute equals {@code name}. */
private static String valueOfNamedField(Element form, String name) {
    return form.getElementsByAttributeValue("name", name).first().attr("value");
}
 
Example 9
Source File: HtmlTreeBuilder.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Adopts the absolute {@code href} of the given element as the base URI, once.
 *
 * <p>Does nothing if a base URI was already taken from the document, or if the element has no
 * resolvable {@code href}.
 *
 * @param base element whose {@code href} is considered
 */
void maybeSetBaseUri(Element base) {
    if (!baseUriSetFromDoc) { // only the first <base href> seen during the parse counts
        String resolved = base.absUrl("href");
        if (resolved.length() > 0) { // skips <base target> and the like
            baseUri = resolved;
            baseUriSetFromDoc = true;
            // also set on the doc so doc.createElement(Tag) gets the updated base, and to update all descendants
            doc.setBaseUri(resolved);
        }
    }
}
 
Example 10
Source File: Whitelist.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Tests whether the URL stored in {@code attr} is acceptable under one of the given protocols.
 *
 * <p>The attribute is resolved to an absolute URL through {@code el}, falling back to the raw
 * value when resolution yields an empty string. Unless {@code preserveRelativeLinks} is set,
 * the attribute is overwritten with the value that was tested. A protocol of {@code "#"} is
 * checked with {@code isValidAnchor}; any other protocol matches when the lower-cased value
 * starts with {@code protocol + ":"}.
 *
 * @param el element that owns the attribute
 * @param attr attribute holding the URL
 * @param protocols protocols to accept
 * @return {@code true} on the first protocol that accepts the value, otherwise {@code false}
 */
private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) {
    // Prefer the absolute form; if there is none, test the raw value so that custom,
    // unknown protocols still get a chance.
    String absolute = el.absUrl(attr.getKey());
    String value = absolute.length() == 0 ? attr.getValue() : absolute;
    if (!preserveRelativeLinks) {
        attr.setValue(value);
    }

    for (Protocol candidate : protocols) {
        String scheme = candidate.toString();

        if (scheme.equals("#")) { // anchor links
            if (isValidAnchor(value)) {
                return true;
            }
            continue;
        }

        if (lowerCase(value).startsWith(scheme + ":")) {
            return true;
        }
    }
    return false;
}
 
Example 11
Source File: HtmlTreeBuilder.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Adopts the absolute {@code href} of the given element as the base URI, once.
 *
 * <p>Does nothing if a base URI was already taken from the document, or if the element has no
 * resolvable {@code href}.
 *
 * @param base element whose {@code href} is considered
 */
void maybeSetBaseUri(Element base) {
    if (!baseUriSetFromDoc) { // only the first <base href> seen during the parse counts
        String absoluteHref = base.absUrl("href");
        if (absoluteHref.length() > 0) { // skips <base target> and the like
            baseUri = absoluteHref;
            baseUriSetFromDoc = true;
            // also set on the doc so doc.createElement(Tag) gets the updated base, and to update all descendants
            doc.setBaseUri(absoluteHref);
        }
    }
}
 
Example 12
Source File: Free_yitianjianssCrawlerServiceImpl.java    From ShadowSocks-Share with Apache License 2.0 5 votes vote down vote up
/**
 * Parses ShadowSocks details out of a page.
 *
 * <p>Each {@code div.image > img} element's absolute {@code src} is handed to
 * {@code parseURL}; the resulting entity is filled in, marked valid when {@code isReachable}
 * says so, and added to the result. A failure on one image is logged and does not stop the
 * remaining images from being processed.
 *
 * @param document page to read from
 * @return the entities that could be built
 */
@Override
protected Set<ShadowSocksDetailsEntity> parse(Document document) {
	Elements images = document.select("div.image > img");
	int total = images.size();

	Set<ShadowSocksDetailsEntity> result = new HashSet<>(total);

	for (int index = 0; index < total; index++) {
		try {
			// Read the src attribute.
			String src = images.get(index).absUrl("src");

			ShadowSocksDetailsEntity entity = parseURL(src);
			entity.setValid(false);
			entity.setValidTime(new Date());
			entity.setTitle(document.title());
			entity.setRemarks(TARGET_URL);
			entity.setGroup("ShadowSocks-Share");

			// Test the network.
			if (isReachable(entity)) {
				entity.setValid(true);
			}

			// Stored whether or not it turned out to be usable.
			result.add(entity);

			log.debug("*************** 第 {} 条 ***************{}{}", index + 1, System.lineSeparator(), entity);
		} catch (Exception e) {
			log.error(e.getMessage(), e);
		}
	}
	return result;
}
 
Example 13
Source File: Whitelist.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Tests whether the URL stored in {@code attr} is acceptable under one of the given protocols.
 *
 * <p>The attribute is resolved to an absolute URL through {@code el}, falling back to the raw
 * value when resolution yields an empty string. Unless {@code preserveRelativeLinks} is set,
 * the attribute is overwritten with the value that was tested. A protocol of {@code "#"} is
 * checked with {@code isValidAnchor}; any other protocol matches when the lower-cased value
 * starts with {@code protocol + ":"}.
 *
 * @param el element that owns the attribute
 * @param attr attribute holding the URL
 * @param protocols protocols to accept
 * @return {@code true} on the first protocol that accepts the value, otherwise {@code false}
 */
private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) {
    // try to resolve relative urls to abs, and optionally update the attribute so output html has abs.
    // rels without a baseuri get removed
    String value = el.absUrl(attr.getKey());
    if (value.length() == 0) {
        value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown protocols
    }
    if (!preserveRelativeLinks) {
        attr.setValue(value);
    }

    // Lower-case once with a fixed locale: the no-arg toLowerCase() follows the default locale,
    // so under e.g. a Turkish locale "FILE:" would not become "file:" and the match would fail.
    final String lowerValue = value.toLowerCase(java.util.Locale.ENGLISH);
    for (Protocol protocol : protocols) {
        String prot = protocol.toString();

        if (prot.equals("#")) { // allows anchor links
            if (isValidAnchor(value)) {
                return true;
            } else {
                continue;
            }
        }

        if (lowerValue.startsWith(prot + ":")) {
            return true;
        }
    }
    return false;
}
 
Example 14
Source File: StructuralAnnotations.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the annotation for an anchor element.
 *
 * <p>The target is the element's absolute {@code href}, or the raw {@code href} when no
 * absolute form is available. With a non-empty target a {@link Link} is returned; otherwise a
 * plain {@link Anchor}.
 *
 * @param jCas CAS the annotation is created in
 * @param element the anchor element
 * @return a {@code Link} pointing at the target, or an {@code Anchor} when there is no target
 */
private Structure createAnchor(final JCas jCas, final Element element) {
  final String absolute = element.absUrl("href");
  final String target = Strings.isNullOrEmpty(absolute) ? element.attr("href") : absolute;

  if (Strings.isNullOrEmpty(target)) {
    return new Anchor(jCas);
  }

  final Link link = new Link(jCas);
  link.setTarget(target);
  return link;
}
 
Example 15
Source File: HtmlConverter.java    From docx4j-template with Apache License 2.0 5 votes vote down vote up
/**
 * Fetches a page and returns it as a {@link org.jsoup.nodes.Document} set up for XHTML output.
 *
 * <p>All {@code script} elements are removed, the {@code onclick} and {@code href} attributes
 * are stripped from every {@code a} element, and the {@code href} of every {@code link}
 * element is replaced by its absolute form.
 *
 * @param url address of the page to fetch
 * @return the cleaned document, with XML syntax and XHTML escape mode in its output settings
 * @throws Exception if the page cannot be fetched
 */
protected Document url2xhtml(String url) throws Exception {
    Document page = Jsoup.connect(url).get(); // fetch

    if (logger.isDebugEnabled()) {
        logger.debug("baseUri: {}", page.baseUri());
    }

    // Remove every script element.
    Elements scripts = page.getElementsByTag("script");
    for (Element node : scripts) {
        node.remove();
    }

    // Strip the onclick and href attributes from anchors.
    Elements anchors = page.getElementsByTag("a");
    for (Element anchor : anchors) {
        anchor.removeAttr("onclick");
        anchor.removeAttr("href");
    }

    // Replace the addresses in link elements with absolute ones.
    for (Element link : page.getElementsByTag("link")) {
        String absolute = link.absUrl("href");

        if (logger.isDebugEnabled()) {
            logger.debug("href: {} -> {}", link.attr("href"), absolute);
        }

        link.attr("href", absolute);
    }

    // Switch the output to XHTML.
    Document.OutputSettings settings = page.outputSettings();
    settings.syntax(Document.OutputSettings.Syntax.xml);
    settings.escapeMode(Entities.EscapeMode.xhtml);

    if (logger.isDebugEnabled()) {
        int lineNo = 0;
        for (String line : page.html().split("\n")) {
            lineNo++;
            logger.debug("line {}:\t{}", lineNo, line);
        }
    }
    return page;
}
 
Example 16
Source File: VidbleRipper.java    From ripme with MIT License 5 votes vote down vote up
/**
 * Collects image URLs from the {@code #ContentPlaceHolder1_divContent} part of a page.
 *
 * <p>For each {@code img} the absolute {@code src} is taken and every occurrence of an
 * underscore followed by three to five ASCII letters is removed; URLs that end up empty are
 * left out.
 *
 * @param doc page to read from
 * @return the cleaned image URLs, in document order
 */
private static List<String> getURLsFromPageStatic(Document doc) {
     List<String> urls = new ArrayList<>();
     Elements images = doc.select("#ContentPlaceHolder1_divContent").select("img");
     for (Element image : images) {
         String cleaned = image.absUrl("src").replaceAll("_[a-zA-Z]{3,5}", "");
         if (cleaned.isEmpty()) {
             continue;
         }
         urls.add(cleaned);
     }
     return urls;
}
 
Example 17
Source File: HtmlParser.java    From gecco with MIT License 4 votes vote down vote up
/**
 * Returns the absolute URL held in the given attribute of an image element.
 *
 * @param img the element, may be {@code null}
 * @param attr name of the attribute to resolve
 * @return the absolute URL, or {@code null} when {@code img} is {@code null}
 */
public String $image(Element img, String attr) {
	return img == null ? null : img.absUrl(attr);
}
 
Example 18
Source File: CurseCrawler.java    From TinkerTime with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Returns the URL of the first {@code img.primary-project-attachment} image on the page
 * obtained from {@code getPage(getApiUrl())}.
 *
 * @return the absolute URL of the image's {@code src}
 * @throws IOException if the page contains no such image, or if its {@code src} does not form
 *     a valid URL
 */
@Override
public URL getImageUrl() throws IOException {
	Document mainPage = getPage(getApiUrl());
	Element ele = mainPage.select("img.primary-project-attachment").first();
	// first() yields null when nothing matches; report that through the declared
	// IOException rather than letting a NullPointerException escape.
	if (ele == null) {
		throw new IOException("No img.primary-project-attachment element found on page");
	}
	return new URL(ele.absUrl("src"));
}
 
Example 19
Source File: HtmlParse.java    From ChipHellClient with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the list of plate groups.
 *
 * <p>Every element with class {@code bm} becomes a group titled by its first {@code bm_h}
 * element; every {@code bm_c} element inside it becomes a plate.
 *
 * @param content markup of the page
 * @return the parsed groups, in document order
 */
public static List<PlateGroup> parsePlateGroupList(String content) {
    List<PlateGroup> result = new ArrayList<PlateGroup>();
    Document page = Jsoup.parse(content);
    page.setBaseUri(Constants.BASE_URL);

    for (Element groupNode : page.getElementsByClass("bm")) {
        PlateGroup group = new PlateGroup();
        group.setTitle(groupNode.getElementsByClass("bm_h").first().text());

        List<Plate> plates = new ArrayList<Plate>();
        for (Element plateNode : groupNode.getElementsByClass("bm_c")) {
            Plate plate = new Plate();

            // Links: the first is the plate link; a second one, if present, is the
            // remove-from-favorites link.
            Elements anchors = plateNode.getElementsByTag("a");
            Element primary = anchors.first();
            String plateTitle = primary.text();
            String plateUrl = primary.absUrl("href");

            Elements counters = plateNode.getElementsByClass("xg1");
            String xg1 = counters.isEmpty() ? "(0)" : counters.first().text();

            // Determine whether the plate is a favorite.
            String favoriteId = anchors.size() > 1
                    ? new UrlParamsMap(anchors.get(1).absUrl("href")).get("favid")
                    : null;

            plate.setTitle(plateTitle);
            plate.setUrl(plateUrl);
            plate.setXg1(xg1);
            plate.setFavoriteId(favoriteId);
            plates.add(plate);
        }

        group.setPlates(plates);
        result.add(group);
    }

    return result;
}
 
Example 20
Source File: HtmlParser.java    From gecco with MIT License 4 votes vote down vote up
/**
 * Returns the absolute URL held in the given attribute of a link element.
 *
 * @param href the element, may be {@code null}
 * @param attr name of the attribute to resolve
 * @return the absolute URL, or {@code null} when {@code href} is {@code null}
 */
public String $href(Element href, String attr) {
	return href == null ? null : href.absUrl(attr);
}