Java Code Examples for org.jsoup.nodes.Element#toString()

The following examples show how to use org.jsoup.nodes.Element#toString(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out the related API usage in the sidebar.
Example 1
Source File: JsoupUtil.java    From xxl-crawler with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Extracts a string value from an element according to the requested selection type.
 *
 * @param fieldElement element to read from
 * @param selectType   which aspect of the element to extract
 * @param selectVal    attribute name (for ATTR) or class name (for HAS_CLASS); not used otherwise
 * @return the result of {@code html()}, {@code val()}, {@code text()}, {@code attr(selectVal)},
 *         or the stringified {@code hasClass(selectVal)} flag; for any other (or null) select
 *         type, the element's {@code toString()}
 */
public static String parseElement(Element fieldElement, XxlCrawlerConf.SelectType selectType, String selectVal) {
    // Identity comparison is kept on purpose: it tolerates a null selectType, which then falls through to toString().
    if (selectType == XxlCrawlerConf.SelectType.HTML) {
        return fieldElement.html();
    }
    if (selectType == XxlCrawlerConf.SelectType.VAL) {
        return fieldElement.val();
    }
    if (selectType == XxlCrawlerConf.SelectType.TEXT) {
        return fieldElement.text();
    }
    if (selectType == XxlCrawlerConf.SelectType.ATTR) {
        return fieldElement.attr(selectVal);
    }
    if (selectType == XxlCrawlerConf.SelectType.HAS_CLASS) {
        boolean classPresent = fieldElement.hasClass(selectVal);
        return String.valueOf(classPresent);
    }
    return fieldElement.toString();
}
 
Example 2
Source File: HbrowseParser.java    From Hentoid with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the image URLs of a chapter from the page's script elements.
 * <p>
 * The first script whose text contains a complete {@code list = [ ... ];} literal is used; its
 * comma-separated entries (quotes removed, surrounding whitespace trimmed) are appended to
 * {@code <site url>data/<unique site id>/<chapter>/}. Blank entries and the entry {@code zzz}
 * (case-insensitive) are skipped.
 *
 * @param content content whose URL and unique site id are used to build the image URLs
 * @param scripts script elements to scan
 * @return the image URLs, or an empty list if no script contains a usable list literal
 */
public static List<String> parseImages(@NonNull Content content, @NonNull List<Element> scripts) {
    content.populateUniqueSiteId();
    List<String> result = new ArrayList<>();

    // The chapter is the second "/"-separated segment of the content URL, when there is one
    String chapter = "";
    String[] parts = content.getUrl().split("/");
    if (parts.length > 1) chapter = parts[1];

    final String listStart = "list = [";
    final String listEnd = "];";

    for (Element e : scripts) {
        String scriptContent = e.toString();

        // Look for the exact opening marker: merely containing the word "list" does not
        // guarantee the literal is there, and a failed indexOf would yield a bogus offset
        int markerIndex = scriptContent.indexOf(listStart);
        if (markerIndex < 0) continue;

        int beginIndex = markerIndex + listStart.length();
        int endIndex = scriptContent.indexOf(listEnd, beginIndex);
        if (endIndex < 0) continue; // unterminated literal; try the next script

        String[] list = scriptContent.substring(beginIndex, endIndex).replace("\"", "").split(",");
        for (String s : list) {
            // Use the trimmed name for both the checks and the URL so stray whitespace
            // around an entry never ends up inside the generated URL
            String fileName = s.trim();
            if (!fileName.isEmpty() && !fileName.equalsIgnoreCase("zzz")) {
                String imgUrl = Site.HBROWSE.getUrl() + "data/" + content.getUniqueSiteId() + "/" + chapter + "/" + fileName;
                result.add(imgUrl);
            }
        }
        break;
    }

    return result;
}
 
Example 3
Source File: Parse99Mm.java    From v9porn with MIT License 5 votes vote down vote up
/**
 * Builds the list of image URLs for a gallery page.
 * <p>
 * The host is taken from the {@code src} of the first {@code img} inside the element with id
 * {@code picbox}; if that value cannot be parsed as a URL, a fixed fallback host is used. The
 * image ids come from the comma-separated data found in the first {@code script} element of the
 * body: the first six entries are skipped (entry 1 is used as the URL path prefix) and every
 * remaining entry is treated as an image id.
 *
 * @param html page HTML
 * @return the image URLs; empty when the script data holds no image ids
 */
public static List<String> parse99MmImageList(String html) {

        Document doc = Jsoup.parse(html);

        Element elementBox = doc.getElementById("picbox");
        String imgUrl = elementBox.selectFirst("img").attr("src").trim();
        HttpUrl httpUrl = HttpUrl.parse(imgUrl);
        Element element = doc.body().select("script").first();
        String javaScript = element.toString();
        String data = StringUtils.subString(javaScript, javaScript.indexOf("[") + 1, javaScript.lastIndexOf(";") - 1);
        String[] dataArray = data.replace("\"", "").split(",");
        Logger.t(TAG).d(dataArray);

        List<String> stringImageList = new ArrayList<>();

        int imgIdArrayLength = dataArray.length - 6;
        if (imgIdArrayLength < 0) {
            // Fewer than six entries: there are no image ids, and allocating an array of
            // negative size would throw NegativeArraySizeException
            return stringImageList;
        }

        String[] imgIdArray = new String[imgIdArrayLength];
        System.arraycopy(dataArray, 6, imgIdArray, 0, imgIdArrayLength);
        Logger.t(TAG).d(imgIdArray);

        String host;
        if (httpUrl == null) {
            // No trailing slash here: the separator is appended when the URL is assembled
            // below, so a trailing slash would produce "//" in every generated URL
            host = "http://fj.kanmengmei.com";
        } else {
            host = httpUrl.scheme() +"://"+ httpUrl.host();
        }

        for (int i = 0; i < imgIdArrayLength; i++) {
            // Image numbering in the URL is 1-based
            String tmpImgUrl = host + "/" + dataArray[1] + (i + 1) + "-" + imgIdArray[i] + ".jpg";
            Logger.t(TAG).d(tmpImgUrl);
            stringImageList.add(tmpImgUrl);
        }
        return stringImageList;

    }
 
Example 4
Source File: Parse99Mm.java    From v9porn with MIT License 5 votes vote down vote up
/**
 * Builds the list of image URLs for a gallery page.
 * <p>
 * The host is taken from the {@code src} of the first {@code img} inside the element with id
 * {@code picbox}; if that value cannot be parsed as a URL, a fixed fallback host is used. The
 * image ids come from the comma-separated data found in the first {@code script} element of the
 * body: the first six entries are skipped (entry 1 is used as the URL path prefix) and every
 * remaining entry is treated as an image id.
 *
 * @param html page HTML
 * @return the image URLs; empty when the script data holds no image ids
 */
public static List<String> parse99MmImageList(String html) {

        Document doc = Jsoup.parse(html);

        Element elementBox = doc.getElementById("picbox");
        String imgUrl = elementBox.selectFirst("img").attr("src").trim();
        HttpUrl httpUrl = HttpUrl.parse(imgUrl);
        Element element = doc.body().select("script").first();
        String javaScript = element.toString();
        String data = StringUtils.subString(javaScript, javaScript.indexOf("[") + 1, javaScript.lastIndexOf(";") - 1);
        String[] dataArray = data.replace("\"", "").split(",");
        Logger.t(TAG).d(dataArray);

        List<String> stringImageList = new ArrayList<>();

        int imgIdArrayLength = dataArray.length - 6;
        if (imgIdArrayLength < 0) {
            // Fewer than six entries: there are no image ids, and allocating an array of
            // negative size would throw NegativeArraySizeException
            return stringImageList;
        }

        String[] imgIdArray = new String[imgIdArrayLength];
        System.arraycopy(dataArray, 6, imgIdArray, 0, imgIdArrayLength);
        Logger.t(TAG).d(imgIdArray);

        String host;
        if (httpUrl == null) {
            // No trailing slash here: the separator is appended when the URL is assembled
            // below, so a trailing slash would produce "//" in every generated URL
            host = "http://fj.kanmengmei.com";
        } else {
            host = httpUrl.scheme() +"://"+ httpUrl.host();
        }

        for (int i = 0; i < imgIdArrayLength; i++) {
            // Image numbering in the URL is 1-based
            String tmpImgUrl = host + "/" + dataArray[1] + (i + 1) + "-" + imgIdArray[i] + ".jpg";
            Logger.t(TAG).d(tmpImgUrl);
            stringImageList.add(tmpImgUrl);
        }
        return stringImageList;

    }
 
Example 5
Source File: VideoUrlParser.java    From v9porn with MIT License 5 votes vote down vote up
/**
 * Extracts the video id and the encoded video URL expression from a video page.
 * <p>
 * The id is cut out of the {@code poster} attribute of the element with id {@code player_one};
 * the "video URL" stored in the result is the raw script text starting at {@code strencode} and
 * ending just before the closing {@code ");"} of that call. Remaining fields are filled in by
 * {@code parserOtherInfo}.
 *
 * @param html page HTML
 * @param user current user, passed through to {@code parserOtherInfo}
 * @return the populated video result
 */
@Override
    public VideoResult parseVideoPlayUrl(String html, User user) {
        VideoResult videoResult = new VideoResult();
        Document document = Jsoup.parse(html);
        Element element = document.getElementById("player_one");

        // The id is the part of the poster URL that starts 6 characters after the beginning
        // of "thumb" and runs up to the last "."
        String imgUrl = element.attr("poster");
        String videoId = imgUrl.substring(imgUrl.indexOf("thumb") + 6, imgUrl.lastIndexOf("."));
        videoResult.setVideoId(videoId);
        Logger.t(TAG).d("视频Id:" + videoId);

        Element jsElement = element.select("script").first();
        String jsTagString = jsElement.toString();
        int encodedStart = jsTagString.indexOf("strencode");
        // Search for the closing marker only after the start of the call, so an earlier
        // occurrence in the script cannot produce an inverted substring range
        int encodedEnd = jsTagString.indexOf(");", encodedStart);
        String jsScriptVideoUrl = jsTagString.substring(encodedStart, encodedEnd);

        videoResult.setVideoUrl(jsScriptVideoUrl);
        parserOtherInfo(document, videoResult, user);
        return videoResult;
    }
 
Example 6
Source File: HtmlUtil.java    From V2EX with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Parses a topic page into a {@code Topic}, including its reply list.
 * <p>
 * Values are read partly from meta tags (publish time, id, node) and partly by applying regular
 * expressions to the serialized HTML of the first {@code #Main > .box} element.
 *
 * @param html topic page HTML
 * @return the populated topic
 */
public static Topic getTopicAndReplies(String html){

        Topic topic = new Topic();
        Document page = Jsoup.parse(html);

        Element mainBox = page.selectFirst("#Main > .box");
        String mainBoxHtml = mainBox.toString();
        Element statsSpan = page.selectFirst("#Main > .box > .cell > span");
        Element contentBlock = mainBox.selectFirst(".topic_content");
        Element subtleBlock = mainBox.selectFirst(".subtle");

        // Replace the "T" and "Z" markers of the timestamp with spaces before parsing it
        Element publishedMeta = page.selectFirst("meta[property=article:published_time]");
        String publishedTime = publishedMeta.attr("content").replaceAll("[TZ]", " ");
        topic.setCreated(TimeUtil.strToTimestamp(publishedTime,null));

        // The id is the first run of two or more digits in the og:url value
        Pattern idPattern = Pattern.compile("(\\d{2,})");
        String topicUrl = page.selectFirst("meta[property=og:url]").attr("content");
        topic.setId(matcherGroup1Int(idPattern, topicUrl));

        String title = mainBox.selectFirst(".header > h1").text();
        topic.setTitle(title);
        topic.setClicks(matcherGroup1Int(PATTERN_TOPIC_CLICK, mainBoxHtml));

        Pattern agoPattern = Pattern.compile("· ([^·]+) ·");
        String smallHtml = mainBox.selectFirst(".header > small").toString();
        topic.setAgo(matcherGroup1(agoPattern, smallHtml));

        topic.setFavors(matcherGroup1Int(PATTERN_TOPIC_FAVORS, mainBoxHtml));

        // Placeholders stand in for a missing content block or subtle block
        String renderedContent;
        if (contentBlock == null) {
            renderedContent = "<br>";
        } else {
            renderedContent = contentBlock.toString();
        }
        String renderedSubtle;
        if (subtleBlock == null) {
            renderedSubtle = " ";
        } else {
            renderedSubtle = subtleBlock.toString();
        }
        topic.setContent_rendered("\n" + renderedContent + renderedSubtle + "\n\t---");

        String username = matcherGroup1(PATTERN_TOPIC_USERNAME, mainBoxHtml);
        String avatar = matcherGroup1(PATTERN_TOPIC_USER_AVATAR, mainBoxHtml);
        topic.setMember(new Member(username, avatar));

        String nodeTag = page.selectFirst("meta[property=article:tag]").attr("content");
        String nodeSection = page.selectFirst("meta[property=article:section]").attr("content");
        topic.setNode(new Node(nodeTag, nodeSection));

        if (statsSpan != null){
            String statsHtml = statsSpan.toString();
            String lastTouched = matcherGroup1(Pattern.compile("直到 ([^+]+)"), statsHtml);
            if (lastTouched.isEmpty()) {
                topic.setLast_touched(0);
            } else {
                topic.setLast_touched(TimeUtil.strToTimestamp(lastTouched,null));
            }
            topic.setReplies(matcherGroup1Int(PATTERN_TOPIC_REPLY_COUNT, statsHtml));
        }

        String posterName = topic.getMember().getUsername();
        topic.setReplyList(getReplies(page, posterName));
        return topic;
    }
 
Example 7
Source File: HtmlUtil.java    From V2EX with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Parses every reply cell ({@code #Main > .box > .cell[id]}) of a topic page.
 *
 * @param document parsed topic page
 * @param poster   username of the topic author; when non-null, each reply is flagged as to
 *                 whether it was written by that user
 * @return the replies in document order, with floors numbered from 0
 * @throws V2exException if a reply cell has no {@code .reply_content} element
 */
private static List<Reply> getReplies(Document document, String poster){

        Elements replyCells = document.select("#Main > .box > .cell[id]");
        List<Reply> replies = new ArrayList<>(replyCells.size());

        int floor = 0;
        for (Element replyCell : replyCells) {
            Reply reply = new Reply();

            Element contentElement = replyCell.selectFirst(".reply_content");
            if (contentElement == null) {
                throw new V2exException("This post seems to have been blocked\nEmpty reply content");
            }
            // Force images to scale with the container; this must happen before the cell is
            // serialized below, because the serialized HTML includes these attributes
            for (Element image : contentElement.select("img")) {
                image.attr("width","100%");
                image.attr("height","auto");
            }
            reply.setContent(contentElement.html());

            // The remaining fields are extracted from the cell's HTML by regular expression
            String cellHtml = replyCell.toString();
            int replyId = matcherGroup1Int(PATTERN_REPLY_ID, cellHtml);
            String author = matcherGroup1(PATTERN_REPLY_USERNAME, cellHtml);
            String authorAvatar = matcherGroup1(PATTERN_REPLY_AVATAR, cellHtml);

            reply.setId(replyId);
            reply.setMember(new Member(author, authorAvatar));
            if (poster != null) {
                reply.setPoster(author.equals(poster));
            }
            reply.setAgo(matcherGroup1(PATTERN_REPLY_AGO, cellHtml));
            reply.setVia(matcherGroup1(PATTERN_REPLY_VIA, cellHtml));
            reply.setLike(matcherGroup1Int(PATTERN_REPLY_LIKE, cellHtml));
            reply.setFloor(floor);

            replies.add(reply);
            floor++;
        }
        return replies;
    }
 
Example 8
Source File: Handian.java    From ankihelper with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Looks up a word online and wraps the first matching entry as a definition.
 * <p>
 * The page at {@code wordUrl + key} is fetched, and the first element matching
 * {@code div.cdnr, div.tagContent} is serialized, with site-relative image sources rewritten to
 * absolute zdic.net URLs and {@code &amp;} unescaped to {@code &}.
 *
 * @param key word to look up
 * @return a list holding at most one definition; empty if nothing matched or the request
 *         failed with an {@code IOException}
 */
public List<Definition> wordLookup(String key) {
        try {
            Request lookupRequest = new Request.Builder()
                    .url(wordUrl + key)
                    .header("User-Agent", Constant.UA)
                    .build();
            String pageSource = MyApplication.getOkHttpClient()
                    .newCall(lookupRequest)
                    .execute()
                    .body()
                    .string();

            Elements entries = Jsoup.parse(pageSource).select("div.cdnr, div.tagContent");
            ArrayList<Definition> definitions = new ArrayList<>();
            if (entries.isEmpty()) {
                return definitions;
            }

            // Only the first matching entry is used
            String entryHtml = entries.get(0).toString()
                    .replace("<img src=\"/", "<img src=\"http://www.zdic.net/")
                    .replace("&amp;", "&");

            HashMap<String, String> fields = new HashMap<>();
            fields.put(EXP_ELE[0], key);
            fields.put(EXP_ELE[1], entryHtml);
            definitions.add(new Definition(fields, entryHtml));
            return definitions;
        } catch (IOException ioe) {
            Log.d("time out", Log.getStackTraceString(ioe));
            return new ArrayList<Definition>();
        }

    }
 
Example 9
Source File: WendaDetailPresenter.java    From Toutiao with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps the first {@code con-words} element of a response in a standalone HTML page.
 * <p>
 * The page links the light stylesheet from the app's assets, or the dark one when night mode is
 * enabled in the settings.
 *
 * @param response raw HTML of the response
 * @return the complete HTML page, or {@code null} if the response has no {@code con-words}
 *         element
 */
private String getHTML(String response) {
    // The second argument of Jsoup.parse(String, String) is a base URI, not a charset, so the
    // single-argument overload is the correct call for an already-decoded string
    Document doc = Jsoup.parse(response);
    Element answer = doc.getElementsByClass("con-words").first();
    if (answer == null) {
        return null;
    }
    String content = answer.toString();

    String css = "<link rel=\"stylesheet\" href=\"file:///android_asset/toutiao_light.css\" type=\"text/css\">";
    if (SettingUtil.getInstance().getIsNightMode()) {
        css = css.replace("toutiao_light", "toutiao_dark");
    }

    // The head is closed explicitly so the generated page is well-formed
    return "<!DOCTYPE html>\n" +
            "<html lang=\"en\">\n" +
            "<head>\n" +
            "    <meta charset=\"UTF-8\">" +
            css +
            "\n</head>\n" +
            "<body>\n" +
            "<article class=\"article-container\">\n" +
            "    <div class=\"article__content article-content\">" +
            content +
            "    </div>\n" +
            "</article>\n" +
            "</body>\n" +
            "</html>";
}
 
Example 10
Source File: PhotoContentPresenter.java    From Toutiao with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the photo gallery data embedded in a page's scripts into {@code bean}.
 * <p>
 * Every {@code script} tag whose serialized form contains {@code BASE_DATA.galleryInfo} is
 * examined; when its content matches the {@code JSON.parse("...")} pattern, the text between
 * the first {@code (} and the first {@code ),} is cleaned up and deserialized leniently.
 *
 * @param HTML page HTML
 * @return {@code true} if at least one script was successfully parsed into {@code bean}
 */
private Boolean parseHTML(String HTML) {
    boolean parsed = false;
    // Go through all script tags
    for (Element scriptTag : Jsoup.parse(HTML).getElementsByTag("script")) {
        // Filter on the serialized tag
        if (!scriptTag.toString().contains("BASE_DATA.galleryInfo")) {
            continue;
        }
        // Keep only the content of the script (its first child node)
        String script = scriptTag.childNode(0).toString();

        Matcher matcher = Pattern.compile("(JSON.parse\\(\\\".+\\))").matcher(script);
        // The pattern always has exactly one capturing group, so only the first match matters
        if (!matcher.find()) {
            continue;
        }

        // Drop the opening parenthesis and quote, and the quote before the closing "),"
        int openIndex = script.indexOf("(");
        int closeIndex = script.indexOf("),");
        String json = script.substring(openIndex + 2, closeIndex - 1);

        // Handle special characters
        json = ChineseUtil.UnicodeToChs(json);
        json = json.replace("\\", "");

        JsonReader lenientReader = new JsonReader(new StringReader(json));
        lenientReader.setLenient(true);
        bean = new Gson().fromJson(lenientReader, PhotoGalleryBean.class);
        Log.d(TAG, "parseHTML: " + bean.toString());
        parsed = true;
    }
    return parsed;
}
 
Example 11
Source File: LoadTags.java    From NClientV2 with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the part of the element's serialized form that starts at the first {@code '['} and
 * ends just before the first {@code ';'} that follows it.
 *
 * @param e element whose string form contains the array literal
 * @return the array literal text, without the terminating semicolon
 * @throws StringIndexOutOfBoundsException if there is no {@code '['}, or no {@code ';'} after it
 */
private String extractArray(Element e) {
    String t = e.toString();
    int start = t.indexOf('[');
    // Search for the terminator from the bracket onwards: a ';' that appears earlier in the
    // text would otherwise give an end index smaller than the start index
    int end = t.indexOf(';', start);
    return t.substring(start, end);
}
 
Example 12
Source File: AnnouncementListFragment.java    From PKUCourses with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Returns the HTML of the first {@code vtbegenerated} element inside the first {@code details}
 * element of this node.
 *
 * @return the element's outer HTML, or an empty string if either element is missing
 */
public String getContents() {
    // first() returns null when nothing matches, so the outer lookup needs its own null check
    // before the inner lookup dereferences it
    Element details = nNode.getElementsByClass("details").first();
    if (details == null) {
        return "";
    }
    Element tmp = details.getElementsByClass("vtbegenerated").first();
    return tmp == null ? "" : tmp.toString();
}
 
Example 13
Source File: HTMLExtensions.java    From Android-WYSIWYG-Editor with Apache License 2.0 4 votes vote down vote up
/**
 * Re-wraps an element's content in a {@code span} that carries the same {@code style}
 * attribute value.
 *
 * @param element element whose {@code style} attribute and inner HTML are copied
 * @return the serialized {@code span} element
 */
public String getHtmlSpan(Element element) {
    String inlineStyle = element.attr("style");
    String innerHtml = element.html();

    Element span = new Element(Tag.valueOf("span"), "");
    span.attributes().put("style", inlineStyle);
    span.html(innerHtml);
    return span.toString();
}
 
Example 14
Source File: BootstrapHandlerDependenciesTest.java    From flow with Apache License 2.0 4 votes vote down vote up
/**
 * Checks the order and attributes of the elements in the page head.
 * <p>
 * The head's children are walked in order. Until an element containing the client engine file
 * name is seen, elements are treated as Flow dependencies; every element after it is treated as
 * a user dependency. The test asserts the expected "eager"/"lazy"/"inline" markers for each
 * group, that no checked element has a non-empty {@code async} attribute, and that the last
 * Flow dependency index is lower than the first user dependency index.
 */
@Test
public void flowDependenciesShouldBeImportedBeforeUserDependenciesWithCorrectAttributes() {
    Consumer<Document> uiPageTestingMethod = page -> {
        // Flips to true once the element containing the client engine file has been seen
        boolean foundClientEngine = false;
        // Index of the last element visited before (and including) the client engine element
        int flowDependencyMaxIndex = Integer.MAX_VALUE;
        // Smallest index of any element visited after the client engine element
        int userDependencyMinIndex = Integer.MAX_VALUE;

        Elements children = page.head().children();
        for (int i = 0; i < children.size(); i++) {
            Element element = children.get(i);
            String elementString = element.toString();
            if (foundClientEngine) {
                // Everything after the client engine counts as a user dependency
                if (userDependencyMinIndex > i) {
                    userDependencyMinIndex = i;
                }
                // dndConnector.js is exempt from the checks below, including the async check
                // at the end of the loop body
                if (elementString.contains("dndConnector.js")) {
                    continue;
                }
                assertThat(
                        "Expected to have here dependencies added with Flow public api",
                        elementString,
                        either(containsString("eager"))
                                .or(containsString("lazy"))
                                .or(containsString("inline")));
            } else {
                flowDependencyMaxIndex = i;
                // skip element with uidl that contains lazy dependencies
                if (!elementString.contains(BOOTSTRAP_SCRIPT_CONTENTS)) {
                    assertThat(
                            "Flow dependencies should not contain user dependencies",
                            elementString,
                            both(not(containsString("eager")))
                                    .and(not(containsString("lazy")))
                                    .and(not(containsString("inline"))));
                    // The client engine element marks the end of the Flow dependencies
                    if (elementString.contains(
                            BootstrapHandler.clientEngineFile.get())) {
                        foundClientEngine = true;
                    }
                } else {
                    // The bootstrap script element may mention "lazy", but not the other two
                    assertThat(
                            "uidl should not contain eager and inline dependencies",
                            elementString,
                            both(not(containsString("eager")))
                                    .and(not(containsString("inline"))));
                }
            }

            // attr("async") must be the empty string for every element that reaches this point
            assertThat(String.format(
                    "All javascript dependencies should be loaded without 'async' attribute. Dependency with url %s has this attribute",
                    element.attr("src")), element.attr("async"), is(""));
        }

        assertThat(
                "Flow dependencies should be imported before user dependencies",
                flowDependencyMaxIndex,
                is(lessThan(userDependencyMinIndex)));

    };

    // Run the same page check against both UI variants
    testUis(uiPageTestingMethod, new UIAnnotated_LoadingOrderTest(),
            new UIWithMethods_LoadingOrderTest());
}