Java Code Examples for org.jsoup.nodes.Document#setBaseUri()

The following examples show how to use org.jsoup.nodes.Document#setBaseUri(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: XHTMLDocumentHandler.java    From docx4j-template with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches {@code url} with an HTTP POST and parses the response into a {@link Document}.
 *
 * <p>{@code Jsoup.connect(String url)} creates a new connection and {@code post()} fetches
 * and parses the HTML. Only web URLs (http and https) are supported for this kind of request.
 * The request carries the data returned by {@code dataMap.getData1()} and
 * {@code dataMap.getData2()}, the cookies from {@code dataMap.getCookies()}, a fixed user
 * agent, and the timeout configured under
 * {@code Docx4jConstants.DOCX4J_JSOUP_PARSE_TIMEOUTMILLIS}.
 *
 * <p>The base URI configured under {@code Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI} is
 * applied only when it is non-empty. The property defaults to the empty string, and applying
 * that unconditionally would wipe the base URI of the fetched document, leaving relative
 * links unresolvable.
 *
 * @param url the http/https address to post to
 * @param dataMap provider of the request data and cookies
 * @return the parsed document
 * @throws IOException if fetching the HTML from {@code url} fails; callers should handle it
 */
@Override
public Document handle(String url, DataMap dataMap) throws IOException{
	// Read the jsoup settings.
	String baseUri = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,"");
	String userAgent = "Mozilla/5.0 (jsoup)";
	int timeout = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_TIMEOUTMILLIS, Docx4jConstants.DEFAULT_TIMEOUTMILLIS);
	// Fetch the specified URL and parse it into an HTML DOM.
	Document doc = Jsoup.connect(url)
			  .data(dataMap.getData1())
			  .data(dataMap.getData2())
			  .userAgent(userAgent)
			  .cookies(dataMap.getCookies())
			  .timeout(timeout)
			  .post();
	// Override the base URI only when one is actually configured; an empty value would
	// discard the base URI the document already has.
	if (baseUri != null && !baseUri.isEmpty()) {
		doc.setBaseUri(baseUri);
	}
	// Return the Document object.
	return doc;
}
 
Example 2
Source File: HtmlParse.java    From ChipHellClient with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the user information out of a response body.
 *
 * <p>The avatar URL, name and info HTML are read from the first element with class
 * {@code userinfo}. The {@code formhash} parameter is taken from the {@code href} of the
 * first child of the first element with class {@code btn_exit}. Any exception raised while
 * parsing is logged as a warning, and the user is returned with whatever fields had been set
 * up to that point.
 *
 * @param responseBody HTML of the page to parse
 * @return the (possibly only partially populated) user; never {@code null}
 */
public static User parseUserInfo(String responseBody) {
    final User result = new User();
    try {
        Document page = Jsoup.parse(responseBody);
        page.setBaseUri(Constants.BASE_URL);

        // Profile section: avatar, display name and the info box markup.
        Element userInfo = page.getElementsByClass("userinfo").first();
        result.setAvatarUrl(userInfo.getElementsByTag("img").first().attr("src"));
        result.setName(userInfo.getElementsByClass("name").first().text());
        result.setInfo(userInfo.getElementsByClass("user_box").html());

        // The exit button's link carries the form hash as a query parameter.
        Element exitButton = page.getElementsByClass("btn_exit").first();
        String exitHref = exitButton.child(0).attr("href");
        String formHash = new UrlParamsMap(exitHref).get("formhash");

        result.setFormHash(formHash);
        LogMessage.i("formHash", formHash);
    } catch (Exception e) {
        LogMessage.w(TAG + "#parseUserInfo", e);
    }
    return result;
}
 
Example 3
Source File: HtmlParse.java    From ChipHellClient with Apache License 2.0 6 votes vote down vote up
/**
 * Parses an album page.
 *
 * <p>Collects the absolute URL held in the {@code orig} attribute of every element with class
 * {@code postalbum_i}, then reads the number shown in the element with id {@code curpic} and
 * stores it minus one as the current position (presumably the page counts pictures from 1;
 * confirm against the page markup).
 *
 * <p>If the {@code curpic} element is missing or its text is not an integer, the position
 * falls back to 0 instead of throwing, so the album URLs that were parsed are still returned.
 *
 * @param responseBody HTML of the album page
 * @return the album URLs together with the current position; never {@code null}
 */
public static AlbumWrap parseAubum(String responseBody) {
    AlbumWrap albumWrap = new AlbumWrap();
    List<String> albums = new ArrayList<String>();

    Document document = Jsoup.parse(responseBody);
    document.setBaseUri(Constants.BASE_URL);
    for (Element album : document.getElementsByClass("postalbum_i")) {
        albums.add(album.absUrl("orig"));
    }
    albumWrap.setUrls(albums);

    int curPosition = 0;
    Element curpicElement = document.getElementById("curpic");
    if (curpicElement != null) {
        try {
            curPosition = Integer.parseInt(curpicElement.text().trim()) - 1;
        } catch (NumberFormatException ignored) {
            // Malformed counter: keep the first picture as the current one rather than
            // discarding the album that was parsed successfully.
        }
    }
    albumWrap.setCurPosition(curPosition);
    return albumWrap;
}
 
Example 4
Source File: XHTMLDocumentHandler.java    From docx4j-template with Apache License 2.0 5 votes vote down vote up
/**
 * Fetches {@code url} with {@code Jsoup.parse(URL url, int timeoutMillis)} and returns the
 * parsed {@link Document}.
 *
 * <p>Only web URLs (http and https) are supported for this kind of fetch. The timeout is read
 * from {@code Docx4jConstants.DOCX4J_JSOUP_PARSE_TIMEOUTMILLIS}.
 *
 * <p>The base URI configured under {@code Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI} is
 * applied only when it is non-empty. The property defaults to the empty string, and applying
 * that unconditionally would wipe the base URI of the fetched document, leaving relative
 * links unresolvable.
 *
 * @param url the http/https address to fetch
 * @return the parsed document
 * @throws IOException if fetching the HTML from {@code url} fails; callers should handle it
 */
@Override
public Document handle(URL url) throws IOException{
	// Read the jsoup settings.
	String baseUri = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,"");
	int timeout = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_TIMEOUTMILLIS, Docx4jConstants.DEFAULT_TIMEOUTMILLIS);
	// Fetch the specified URL and parse it into an HTML DOM.
	Document doc = Jsoup.parse(url,timeout);
	// Override the base URI only when one is actually configured; an empty value would
	// discard the base URI the document already has.
	if (baseUri != null && !baseUri.isEmpty()) {
		doc.setBaseUri(baseUri);
	}
	// Return the Document object.
	return doc;
}
 
Example 5
Source File: HtmlBot.java    From ContentExtractor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Builds a {@code DomPage} from raw HTML.
 *
 * @param html markup to parse
 * @param url base URI to assign to the parsed document; no base URI is assigned when
 *     this is {@code null}
 * @return a new {@code DomPage} wrapping the parsed document
 */
public static DomPage getDomPageByHtml(String html,String url){
    final Document parsed = Jsoup.parse(html);
    if (null != url) {
        parsed.setBaseUri(url);
    }
    return new DomPage(parsed);
}
 
Example 6
Source File: HtmlBot.java    From WordCount with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parses an HTML string and wraps the resulting document in a {@code DomPage}.
 *
 * @param html the HTML source to parse
 * @param url the document's base URI, or {@code null} to leave the base URI untouched
 * @return the {@code DomPage} created from the parsed document
 */
public static DomPage getDomPageByHtml(String html,String url){
    Document document = Jsoup.parse(html);
    boolean hasBaseUri = url != null;
    if (hasBaseUri) {
        document.setBaseUri(url);
    }
    return new DomPage(document);
}
 
Example 7
Source File: HtmlParse.java    From ChipHellClient with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the data needed to prepare a quoted reply.
 *
 * <p>Looks up the element with id {@code postform}, resolves its {@code action} attribute to
 * an absolute URL, reads a set of named field values from it and serialises its first
 * {@code blockquote}. If anything fails along the way the stack trace is printed and the
 * reply object is returned as it stands.
 *
 * @param responseBody HTML of the reply page
 * @return the quote-reply data; never {@code null}
 */
public static PrepareQuoteReply parsePrepareQuoteReply(String responseBody) {
    final PrepareQuoteReply result = new PrepareQuoteReply();
    try {
        Document page = Jsoup.parse(responseBody);
        page.setBaseUri(Constants.BASE_URL);

        Element form = page.getElementById("postform");
        String actionUrl = form.absUrl("action");

        // Read every value before touching the result, so a failure part-way through the
        // lookups leaves the result unpopulated.
        String formhash = quoteReplyFieldValue(form, "formhash");
        String posttime = quoteReplyFieldValue(form, "posttime");
        String noticeauthor = quoteReplyFieldValue(form, "noticeauthor");
        String noticetrimstr = quoteReplyFieldValue(form, "noticetrimstr");
        String noticeauthormsg = quoteReplyFieldValue(form, "noticeauthormsg");
        String reppid = quoteReplyFieldValue(form, "reppid");
        String reppost = quoteReplyFieldValue(form, "reppost");
        String quoteBody = form.getElementsByTag("blockquote").first().toString();

        result.setNoticeauthor(noticeauthor);
        result.setNoticeauthormsg(noticeauthormsg);
        result.setNoticetrimstr(noticetrimstr);
        result.setPosttime(posttime);
        result.setQuoteBody(quoteBody);
        result.setReppid(reppid);
        result.setUrl(actionUrl);
        result.setFormhash(formhash);
        result.setReppost(reppost);
    } catch (Exception e) {
        e.printStackTrace();
    }

    return result;
}

/**
 * Returns the {@code value} attribute of the first element, at or below {@code form}, whose
 * {@code name} attribute equals {@code fieldName}.
 */
private static String quoteReplyFieldValue(Element form, String fieldName) {
    return form.getElementsByAttributeValue("name", fieldName).first().attr("value");
}
 
Example 8
Source File: AbstractJsoupExtractor.java    From wandora with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Extracts topics from an HTML file.
 *
 * <p>The file is parsed as UTF-8, the parsed document's base URI is set to the file's
 * absolute path, and the work is handed to
 * {@code extractTopicsFrom(Document, String, TopicMap)} with that same path.
 *
 * @param f the file to read; must not be a directory
 * @param t the topic map passed on to the document-level extractor
 * @return whatever the document-level extractor returns
 * @throws Exception if {@code f} is a directory, or if parsing or extraction fails
 */
@Override
public boolean _extractTopicsFrom(File f, TopicMap t) throws Exception {
    if (f.isDirectory()) {
        throw new Exception("Directories are not supported.");
    }

    final Document parsed = Jsoup.parse(f, "UTF-8");
    final String location = f.getAbsolutePath();
    parsed.setBaseUri(location);

    return extractTopicsFrom(parsed, location, t);
}
 
Example 9
Source File: Item.java    From KaellyBot with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Scrapes an item page and builds the corresponding {@code Item}.
 *
 * <p>The document is obtained through {@code JSoupManager.getDocument(url)} and its base URI
 * is set to {@code url} so that {@code abs:href} lookups resolve. Name, level, type and image
 * source are read from fixed CSS classes. Each {@code ak-panel-title} element is then matched
 * against the translated panel labels to fill in description, effects, characteristics,
 * conditions, set and recipe. Panels that do not appear on the page leave their value
 * {@code null}.
 *
 * @param lg language used to look up the translated panel labels
 * @param url address of the item page; also passed on to the {@code Item}
 * @return the populated item
 * @throws IOException declared by the method; presumably raised when the page cannot be
 *     fetched
 */
public static Item getItem(Language lg, String url) throws IOException {
    Document doc = JSoupManager.getDocument(url);
    doc.setBaseUri(url);
    String name = doc.getElementsByClass("ak-return-link").first().text();
    // The label is plain text, so it is stripped with a literal replace(); replaceAll()
    // would interpret any regex metacharacter in the translated label as a pattern.
    String level = doc.getElementsByClass("ak-encyclo-detail-level").first().text()
            .replace(Translator.getLabel(lg, "item.extract.level") + " ", "");
    String type = doc.getElementsByClass("ak-encyclo-detail-type").last().children().last().text();

    String skinURL = doc.getElementsByClass("ak-encyclo-detail-illu").first()
            .getElementsByTag("img").first().attr("src");

    String description = null;
    String effects = null;
    String caracteristics = null;
    String conditions = null;
    String set = null;
    String setURL = null;
    String recipe = null;

    Elements titles = doc.getElementsByClass("ak-panel-title");
    for (Element title : titles) {
        String titleText = title.text();
        if (titleText.equals(Translator.getLabel(lg, "item.extract.description"))) {
            description = title.parent().getElementsByClass("ak-panel-content").first().text();
        } else if (titleText.equals(Translator.getLabel(lg, "item.extract.effets"))) {
            effects = extractStatsFromTitle(lg, title);
        } else if (titleText.equals(Translator.getLabel(lg, "item.extract.caracteristiques"))) {
            caracteristics = extractLinesFromTitle(title);
        } else if (titleText.equals(Translator.getLabel(lg, "item.extract.evolution_effects"))) {
            effects = extractEvolutionEffectsFromTitle(lg, url);
        } else if (titleText.equals(Translator.getLabel(lg, "item.extract.conditions"))) {
            conditions = extractLinesFromTitle(title);
        } else if (titleText.contains(Translator.getLabel(lg, "item.extract.panoplie"))) {
            Element setLink = title.getElementsByTag("a").first();
            set = setLink.text();
            setURL = setLink.attr("abs:href");
        } else if (titleText.equals(Translator.getLabel(lg, "item.extract.recette"))) {
            // One line per ingredient column, in the form: <front text> [<title>](<absolute link>)
            StringBuilder recipeLines = new StringBuilder();
            for (Element line : title.parent().getElementsByClass("ak-column")) {
                Element ingredient = line.getElementsByClass("ak-title").first();
                recipeLines.append(line.getElementsByClass("ak-front").text()).append(" [")
                        .append(ingredient.text()).append("](")
                        .append(ingredient.children().first().attr("abs:href")).append(")\n");
            }
            recipe = recipeLines.toString();
        }
    }

    return new Item(name, type, level, description, effects, URLManager.abs(skinURL), url,
            caracteristics, conditions, set, setURL, recipe);
}
 
Example 10
Source File: Resource.java    From KaellyBot with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Scrapes a resource page and builds the corresponding {@code Resource}.
 *
 * <p>The document is obtained through {@code JSoupManager.getDocument(url)} and its base URI
 * is set to {@code url} so that {@code abs:href} lookups resolve. Name, type and image source
 * are read from fixed CSS classes; the level is read only when the page has a level element
 * and is {@code null} otherwise. Each {@code ak-panel-title} element is then matched against
 * the translated panel labels to fill in description, effects, bonus, spells, monster drops
 * and recipe. Panels that do not appear on the page leave their value at its initial state
 * ({@code null}, or an empty list for monster drops).
 *
 * @param lg language used to look up the translated panel labels
 * @param url address of the resource page; also passed on to the {@code Resource}
 * @return the populated resource
 * @throws IOException declared by the method; presumably raised when the page cannot be
 *     fetched
 */
public static Resource getResource(Language lg, String url) throws IOException {
    Document doc = JSoupManager.getDocument(url);
    doc.setBaseUri(url);
    String name = doc.getElementsByClass("ak-return-link").first().text();
    String level = null;
    Elements levelElements = doc.getElementsByClass("ak-encyclo-detail-level");
    if (!levelElements.isEmpty()) {
        // The label is plain text, so it is stripped with a literal replace(); replaceAll()
        // would interpret any regex metacharacter in the translated label as a pattern.
        level = levelElements.first().text()
                .replace(Translator.getLabel(lg, "resource.extract.level") + " ", "");
    }
    String type = doc.getElementsByClass("ak-encyclo-detail-type").last().children().last().text();

    String skinURL = doc.getElementsByClass("ak-encyclo-detail-illu").first()
            .getElementsByTag("img").first().attr("src");

    String description = null;
    String effects = null;
    String bonus = null;
    String sorts = null;
    String recipe = null;
    List<String> monsterDrops = new ArrayList<>();

    Elements titles = doc.getElementsByClass("ak-panel-title");
    for (Element title : titles) {
        String titleText = title.text();
        if (titleText.equals(Translator.getLabel(lg, "resource.extract.description"))) {
            description = title.parent().getElementsByClass("ak-panel-content").first().text();
        } else if (titleText.equals(Translator.getLabel(lg, "resource.extract.effets"))) {
            effects = extractStatsFromTitle(lg, title);
        } else if (titleText.equals(Translator.getLabel(lg, "resource.extract.bonus"))) {
            bonus = extractLinesFromTitle(title);
        } else if (titleText.equals(Translator.getLabel(lg, "resource.extract.sorts"))) {
            sorts = title.parent().getElementsByClass("ak-panel-content").first().text();
        } else if (titleText.equals(Translator.getLabel(lg, "resource.extract.monsterDrop"))) {
            monsterDrops = extractDrops(title.parent());
        } else if (titleText.equals(Translator.getLabel(lg, "resource.extract.recette"))) {
            // One line per ingredient column, in the form: <front text> [<title>](<absolute link>)
            StringBuilder recipeLines = new StringBuilder();
            for (Element line : title.parent().getElementsByClass("ak-column")) {
                Element ingredient = line.getElementsByClass("ak-title").first();
                recipeLines.append(line.getElementsByClass("ak-front").text()).append(" [")
                        .append(ingredient.text()).append("](")
                        .append(ingredient.children().first().attr("abs:href")).append(")\n");
            }
            recipe = recipeLines.toString();
        }
    }

    return new Resource(name, type, level, description, effects, URLManager.abs(skinURL), url,
            bonus, sorts, recipe, monsterDrops);
}
 
Example 11
Source File: UntisInfoHeadlessParser.java    From substitution-schedule-parser with Mozilla Public License 2.0 4 votes vote down vote up
/**
 * Logs in, downloads the schedule page and parses it into a {@code SubstitutionSchedule}.
 *
 * <p>Day headings are first looked up under {@code #vertretung}. If none are found and the
 * page contains a frame whose {@code src} contains {@code w00}, that frame's document is
 * loaded and searched instead; if there is no such frame, a fallback selector is tried. When
 * day headings exist, each one is parsed as a day; otherwise, if the page matches the
 * substitution-table selector, it is parsed as a single table. The lists of all classes and
 * all teachers are attached to the schedule in either case.
 *
 * @return the parsed schedule
 * @throws IOException declared by the method; presumably raised when a page cannot be loaded
 * @throws JSONException declared by the method
 * @throws CredentialInvalidException declared by the method; presumably raised by the login
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
		throws IOException, JSONException, CredentialInvalidException {
	new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);

	final SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);
	final String dayHeadingSelector = "#vertretung > p > b, #vertretung > b";

	Document page = Jsoup.parse(httpGet(url, data.optString(PARAM_ENCODING, null)));
	page.setBaseUri(url);
	Elements dayHeadings = page.select(dayHeadingSelector);

	Elements frames = page.select("frame[src*=w00]");
	if (dayHeadings.isEmpty()) {
		if (!frames.isEmpty()) {
			// The schedule is embedded in a frame: load the frame's document instead.
			String frameUrl = frames.get(0).absUrl("src");
			page = Jsoup.parse(httpGet(frameUrl, data.optString(PARAM_ENCODING, null)));
			dayHeadings = page.select(dayHeadingSelector);
		} else {
			// Seen at GHS Berlin: different kinds of center > font > center ... stacked
			// (sometimes within #vertretung).
			dayHeadings = page.select("center > font > p > b");
		}
	}

	final List<String> allClasses = getAllClasses();
	if (!dayHeadings.isEmpty()) {
		// untis-info layout: one heading per day.
		for (Element heading : dayHeadings) {
			SubstitutionScheduleDay day = new SubstitutionScheduleDay();
			day.setLastChangeString("");

			String dateText = heading.text();
			day.setDateString(dateText);
			day.setDate(ParserUtils.parseDate(dateText));

			// Where the day's content starts depends on whether the heading sits inside a <p>.
			Element container = heading.parent();
			Element dayContent = container.tagName().equals("p")
					? container.nextElementSibling().nextElementSibling()
					: container.select("p").first().nextElementSibling();
			parseDay(day, dayContent, schedule, null, allClasses);
		}
	} else if (!page.select("tr:has(td[align=center]):gt(0)").isEmpty()) {
		// untis-subst layout: a single substitution table.
		parseSubstitutionTable(schedule, null, page);
	}

	schedule.setClasses(allClasses);
	schedule.setTeachers(getAllTeachers());
	return schedule;
}
 
Example 12
Source File: HtmlParse.java    From ChipHellClient with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the list of plate (board) groups.
 *
 * <p>Every element with class {@code bm} becomes one group, titled by the text of its first
 * {@code bm_h} element. Every {@code bm_c} element inside it becomes one plate, built from
 * its links and its optional {@code xg1} counter.
 *
 * @param content HTML of the page listing the plates
 * @return the parsed groups in document order; empty when no group element is found
 */
public static List<PlateGroup> parsePlateGroupList(String content) {
    List<PlateGroup> result = new ArrayList<PlateGroup>();
    Document page = Jsoup.parse(content);
    page.setBaseUri(Constants.BASE_URL);

    for (Element groupElement : page.getElementsByClass("bm")) {
        PlateGroup group = new PlateGroup();
        group.setTitle(groupElement.getElementsByClass("bm_h").first().text());

        List<Plate> plates = new ArrayList<Plate>();
        for (Element plateElement : groupElement.getElementsByClass("bm_c")) {
            Plate plate = new Plate();

            // Links: the first is the plate link; a second one, when present, is the
            // remove-from-favourites link.
            Elements links = plateElement.getElementsByTag("a");
            Element plateLink = links.first();
            String plateTitle = plateLink.text();
            String plateUrl = plateLink.absUrl("href");

            // Counter text, with "(0)" standing in when the element is absent.
            Elements counters = plateElement.getElementsByClass("xg1");
            String xg1 = counters.isEmpty() ? "(0)" : counters.first().text();

            // A second link means the plate is a favourite; its URL carries the favourite id.
            String favoriteId = links.size() > 1
                    ? new UrlParamsMap(links.get(1).absUrl("href")).get("favid")
                    : null;

            plate.setTitle(plateTitle);
            plate.setUrl(plateUrl);
            plate.setXg1(xg1);
            plate.setFavoriteId(favoriteId);
            plates.add(plate);
        }

        group.setPlates(plates);
        result.add(group);
    }

    return result;
}