Java Code Examples for org.jsoup.select.Elements#text()

The following examples show how to use org.jsoup.select.Elements#text(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Meizi4493.java    From PicKing with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the URL of the page that follows the current detail page.
 *
 * <p>The response body is decoded as gb2312 and parsed. The next-page link
 * ({@code div.page a#nextpage}) and the current-page marker
 * ({@code div.page a.current}) are looked up; the first positive number found
 * in the link's {@code href} is appended to the prefix of {@code currentUrl}
 * that runs from "http" up to and including its last slash.
 *
 * @param baseUrl    not read by this implementation
 * @param currentUrl URL of the page whose body is {@code result}
 * @param result     raw bytes of the current page
 * @return the next page URL ending in {@code .htm}, or an empty string when
 *         either element is missing, no prefix or number can be extracted, or
 *         the extracted number equals the current-page marker text
 * @throws UnsupportedEncodingException if the gb2312 charset is not supported
 */
@Override
public String getDetailNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    Document page = Jsoup.parse(new String(result, "gb2312"));
    Elements nextLinks = page.select("div.page a#nextpage");
    Elements currentMarkers = page.select("div.page a.current");
    if (nextLinks.isEmpty() || currentMarkers.isEmpty()) {
        return "";
    }

    String currentPage = currentMarkers.text();
    Matcher prefixMatcher = Pattern.compile("http.*/").matcher(currentUrl);
    if (!prefixMatcher.find()) {
        return "";
    }

    String nextHref = nextLinks.get(0).attr("href");
    Matcher pageNumberMatcher = Pattern.compile("[1-9]\\d*").matcher(nextHref);
    if (!pageNumberMatcher.find()) {
        return "";
    }

    String nextPage = pageNumberMatcher.group();
    // The "next" link pointing at the page we are already on means there is nothing further.
    return currentPage.equals(nextPage) ? "" : prefixMatcher.group() + nextPage + ".htm";
}
 
Example 2
Source File: Book.java    From nju-lib-downloader with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Fills in this {@code Book}'s fields from its online reading page.
 *
 * <p>The page title becomes {@link #name}; {@link #author} and
 * {@link #publishDate} are read from {@code span} elements without a
 * {@code style} attribute whose text starts with the matching label.
 * A {@code BookDLException} caught here is printed and otherwise ignored.
 *
 * @param url the book's online reading page
 */
public void fillBookInfoByUrl(String url) {
    try {
        String html = new BookDownloader(this).getBookViewPageHtml(url);
        // Un-comment the markup: "<!--" becomes "<" and "-->" is dropped before parsing.
        html = html.replace("<!--", "<").replace("-->", "");
        Document doc = Jsoup.parse(html);
        this.name = doc.getElementsByTag("title").text();
        for (Element span : doc.getElementsByTag("span").not("[style]")) {
            String text = span.text();
            if (text.startsWith("作者:")) {
                // Skip the 3-character label.
                this.author = text.substring(3);
            }
            if (text.startsWith("出版日期:")) {
                // Skip the 5-character label.
                this.publishDate = text.substring(5);
            }
        }
    } catch (BookDLException e) {
        e.printStackTrace();
    }
}
 
Example 3
Source File: ParseV9PronVideo.java    From v9porn with MIT License 5 votes vote down vote up
/**
 * Parses the error notice shown on a page.
 *
 * @param html page HTML
 * @return the combined text of every {@code div.errorbox} element, or an
 *         empty string when the page has none
 */
public static String parseErrorInfo(String html) {
    // select() yields an empty (never null) list, and text() of an empty list is "".
    return Jsoup.parse(html).select("div.errorbox").text();
}
 
Example 4
Source File: VersionChecker.java    From mobikul-standalone-pos with MIT License 5 votes vote down vote up
/**
 * Fetches the app's Google Play page and reads the published version from
 * the elements whose {@code itemprop} attribute is {@code softwareVersion}.
 * {@code newVersion} is only overwritten when that text is non-empty; an
 * {@code IOException} is printed and leaves {@code newVersion} untouched.
 *
 * @param params not read by this implementation
 * @return the current value of {@code newVersion}
 */
@Override
    protected String doInBackground(String... params) {
        try {
            // Note: a custom user agent / referrer for this request was previously
            // sketched here in commented-out form and is not applied.
            String storeUrl = "https://play.google.com/store/apps/details?id=" + BuildConfig.APPLICATION_ID;
            Document storePage = Jsoup.connect(storeUrl).get();
            String publishedVersion = storePage.getElementsByAttributeValue("itemprop", "softwareVersion").text();
            if (!publishedVersion.isEmpty()) {
                newVersion = publishedVersion;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        Log.d(TAG, "doInBackground: " + newVersion);
        return newVersion;
    }
 
Example 5
Source File: ParseV9PronVideo.java    From v9porn with MIT License 5 votes vote down vote up
/**
 * Extracts the error message displayed on a page.
 *
 * @param html page HTML
 * @return text of the {@code div.errorbox} elements; empty when none are present
 */
public static String parseErrorInfo(String html) {
    final Elements errorBoxes = Jsoup.parse(html).select("div.errorbox");
    return errorBoxes == null ? "" : errorBoxes.text();
}
 
Example 6
Source File: Test2.java    From Doctor with Apache License 2.0 5 votes vote down vote up
/**
 * Parses a local demo page (gb2312) and prints the text of the elements
 * returned by {@code getElementsByClass("ul-ss-3 jb-xx-bw")}; when nothing
 * matches it prints {@code 0} followed by the whole document.
 */
@Test
    public void test() throws IOException {
        // Alternatives tried earlier: fetching the live page through SpiderUtil, and
        // chaining select(".ul-ss-3").select(".jb-xx-bw").
        File demoPage = new File("D:\\IDEA\\workSpace\\robots\\robots\\src\\main\\resources\\demo.html");
        Document document = Jsoup.parse(demoPage, "gb2312", "http://jb39.com");
        Elements matches = document.getElementsByClass("ul-ss-3 jb-xx-bw");
        if (!matches.isEmpty()) {
            System.out.println(matches.text());
            return;
        }
        System.out.println(0);
        System.out.println(document);
    }
 
Example 7
Source File: AgentStatusReportExecutorTest.java    From docker-swarm-elastic-agent-plugin with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that the service-details tab of the rendered report mentions the
 * service's id, its spec name and its container image.
 *
 * @param service  the service whose details are expected in the report
 * @param document the parsed report page
 */
private void assertServiceDetails(Service service, Document document) {
    // Elements.attr(key, value) SETS the attribute on every matched element and
    // returns the same list; it does not filter. Use an attribute selector so only
    // the tab whose ng-show expression targets 'service-details' is inspected.
    final Elements serviceDetails =
            document.select(".tab-content[ng-show=\"currenttab == 'service-details'\"]");
    final String serviceDetailsText = serviceDetails.text();

    assertThat(serviceDetailsText, containsString(service.id()));
    assertThat(serviceDetailsText, containsString(service.spec().name()));
    assertThat(serviceDetailsText, containsString(service.spec().taskTemplate().containerSpec().image()));
}
 
Example 8
Source File: ParseV9PronVideo.java    From v9porn with MIT License 4 votes vote down vote up
/**
 * Parses the "my favorites" page.
 *
 * <p>Each video tile is turned into a {@code V9PornItem}; the total page count
 * is read from the pagination elements when it is numeric. If the page shows
 * a message box, its text is reported with {@code SUCCESS_CODE}; otherwise the
 * error box (see {@code parseErrorInfo}) is reported with {@code ERROR_CODE}.
 *
 * @param html page HTML
 * @return result carrying the parsed items, the total page count and, when
 *         present, the page's message and status code
 */
public static BaseResult<List<V9PornItem>> parseMyFavorite(String html) {
    int totalPage = 1;
    List<V9PornItem> v9PornItemList = new ArrayList<>();
    Document doc = Jsoup.parse(html);

    Elements videos = doc.select("div[class=col-xs-12 col-sm-4 col-md-3 col-lg-3]");

    for (Element element : videos) {

        V9PornItem v9PornItem = new V9PornItem();

        String contentUrl = element.select("a").first().attr("href");

        // Everything after "viewkey" plus one separator character (presumably '=').
        String viewKey = contentUrl.substring(contentUrl.indexOf("viewkey") + 8);
        v9PornItem.setViewKey(viewKey);
        Logger.t(TAG).d(viewKey);

        String title = element.select("span[class=video-title title-truncate m-t-5]").first().text();
        v9PornItem.setTitle(title);
        Logger.t(TAG).d(title);

        String imgUrl = element.select("img").first().attr("src");
        v9PornItem.setImgUrl(imgUrl);
        Logger.t(TAG).d(imgUrl);

        // Duration and info are sliced out of the tile's flattened text by label position.
        String allInfo = element.text();
        Logger.t(TAG).d(allInfo);

        String duration = allInfo.substring(allInfo.indexOf("时长") + 3, allInfo.indexOf("查看"));
        v9PornItem.setDuration(duration);
        Logger.t(TAG).d(duration);

        String info = allInfo.substring(allInfo.indexOf("添加时间") + 5, allInfo.indexOf("时长"));
        v9PornItem.setInfo(info);
        Logger.t(TAG).d(info);

        String rvid = element.select("input").first().attr("value");
        Logger.t(TAG).d("rvid::" + rvid);
        VideoResult videoResult = new VideoResult();
        videoResult.setId(VideoResult.OUT_OF_WATCH_TIMES);
        videoResult.setVideoId(rvid);
        v9PornItem.setVideoResult(videoResult);

        v9PornItemList.add(v9PornItem);
    }

    // Total page count: text of the second-to-last pagination element, when numeric.
    Elements a = doc.select("div[class=pagingnav]");
    if (a.size() >= 2) {
        String ppp = a.get(a.size() - 2).text();
        if (TextUtils.isDigitsOnly(ppp)) {
            totalPage = Integer.parseInt(ppp);
            Logger.d("总页数:" + totalPage);
        }
    }

    BaseResult<List<V9PornItem>> baseResult = new BaseResult<>();
    // Try the message box first (e.g. the notice shown after a deletion).
    // select() never returns null, so branching on a null check would make the
    // error fallback unreachable; branch on whether a message is present instead.
    String msgInfo = doc.select("div.msgbox").text();
    if (!TextUtils.isEmpty(msgInfo)) {
        baseResult.setCode(BaseResult.SUCCESS_CODE);
        baseResult.setMessage(msgInfo);
    } else {
        String errorMsg = parseErrorInfo(html);
        if (!TextUtils.isEmpty(errorMsg)) {
            baseResult.setMessage(errorMsg);
            baseResult.setCode(BaseResult.ERROR_CODE);
        }
    }

    baseResult.setTotalPage(totalPage);
    baseResult.setData(v9PornItemList);

    return baseResult;
}
 
Example 9
Source File: ImgurRipper.java    From ripme with MIT License 4 votes vote down vote up
/**
 * Builds the name used for this rip.
 *
 * <p>For albums the result is {@code "imgur_" + gid + "_" + title}, where the
 * title comes from the {@code og:title} meta tag, or from the {@code <title>}
 * tag when the meta tag only carries one of Imgur's default titles, or is
 * empty when both are defaults. For any other album type, or when loading
 * the album page fails with an {@code IOException}, the result is
 * {@code getHost() + "_" + gid}.
 *
 * @param url the album URL
 * @return the album name described above
 * @throws MalformedURLException declared by the signature; presumably raised by {@code getGID}
 */
public String getAlbumTitle(URL url) throws MalformedURLException {
    final String gid = getGID(url);
    if (this.albumType == ALBUM_TYPE.ALBUM) {
        try {
            // Attempt to use album title as GID
            if (albumDoc == null) {
                albumDoc = Http.url(url).get();
            }

            // TODO: Add config option for including the username (".post-account" text)
            // in the album title as "user_<name>". It's possible a lot of users would
            // not be interested in that info.

            final String defaultTitle1 = "Imgur: The most awesome images on the Internet";
            final String defaultTitle2 = "Imgur: The magic of the Internet";
            LOGGER.info("Trying to get album title");
            // select() never returns null and attr() yields "" when the attribute is absent.
            String title = albumDoc.select("meta[property=og:title]").attr("content");
            LOGGER.debug("Title is " + title);

            // This is here in case the album is unnamed, to prevent Imgur's default
            // title from being added onto the album name.
            final boolean metaIsDefault = title.contains(defaultTitle1) || title.contains(defaultTitle2);
            if (metaIsDefault) {
                LOGGER.debug("Album is untitled or imgur is returning the default title");
                // Start from "": if the next attempt finds a title it replaces this,
                // and if it's not found there is no reason to set it later.
                title = "";
                LOGGER.debug("Trying to use title tag to get title");
                final String tagText = albumDoc.select("title").text();
                if (tagText.contains(defaultTitle1) || tagText.contains(defaultTitle2)) {
                    LOGGER.debug("Was unable to get album title or album was untitled");
                } else {
                    title = tagText;
                }
            }

            return "imgur_" + gid + "_" + title;
        } catch (IOException e) {
            // Fall back to default album naming convention
        }
    }
    return getHost() + "_" + gid;
}
 
Example 10
Source File: BookClass.java    From nju-lib-downloader with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Converts the list items of a result page into {@code Book} objects.
 *
 * <p>For each item the title and id come from {@code p[class=name]}, the
 * category chain from the links inside {@code p[class=info]} (linked onto this
 * category), and author / publish date / subject / category text from the
 * item's flattened text. Items without both a name and an id are reported on
 * standard output and skipped.
 *
 * @param booksliNode the list-item elements, one per book
 * @return the books that could be assembled
 */
private Set<Book> queryBooks(Elements booksliNode) {
    // The same two patterns apply to every item, so compile them once.
    final Pattern detailPattern = Pattern.compile("\\d+\\. (.*) 作者[::](.*) 出版日期[::](\\d+).*?(?:主题词[::](.+))? 分类[::](.*)");
    final Pattern quotedNamePattern = Pattern.compile(".*(《.*》).*");

    final Set<Book> books = new HashSet<>();
    for (Element item : booksliNode) {
        String name, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;

        // Title and id
        Elements nameNodes = item.select("p[class=name]");
        name = nameNodes.text();
        Elements clickable = nameNodes.select("a[onclick]");
        if (!clickable.isEmpty()) {
            String onClick = clickable.get(0).attr("onclick");
            // The id is whatever sits between the first "(" and the last ",".
            int start = onClick.indexOf("(") + 1;
            int end = onClick.lastIndexOf(",");
            if (start != 0 && end != -1) {
                id = onClick.substring(start, end);
            }
        }

        // Category chain
        BookClass[] bookClasses = new BookClass[0];
        Elements categoryLinks = item.select("p[class=info]").select("a");
        if (!categoryLinks.isEmpty()) {
            // The last link is resolved as the terminal category, the others as intermediate ones.
            Element terminalLink = categoryLinks.last();
            categoryLinks.remove(terminalLink);
            List<BookClass> chain = categoryLinks.stream()
                    .map(link -> getBookCata(link, false))
                    .filter(Objects::nonNull)
                    .collect(Collectors.toList());
            BookClass terminal = getBookCata(terminalLink, true);
            if (terminal != null) {
                chain.add(terminal);
            }
            bookClasses = chain.toArray(bookClasses);
        }
        BookClass bookBookClass = this.link(bookClasses);

        // Author, publish date, subject keywords and category text
        String info = item.text();
        Matcher detailMatcher = detailPattern.matcher(info);
        while (detailMatcher.find()) {
            name = detailMatcher.group(1);
            author = detailMatcher.group(2);
            publishDate = detailMatcher.group(3);
            theme = detailMatcher.group(4);
            detailBookClass = detailMatcher.group(5);
        }
        // A title wrapped in 《》 takes precedence over the one captured above.
        Matcher quotedNameMatcher = quotedNamePattern.matcher(info);
        while (quotedNameMatcher.find()) {
            name = quotedNameMatcher.group(1);
        }

        // Assemble the book
        if (name == null || id == null) {
            System.out.println("error: " + info);
            continue;
        }
        Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
        book.setCookie(cookie);
        books.add(book);
        if (bookBookClass.isTerminal()) {
            ((TerminalBookClass) bookBookClass).addBook(book);
        } else {
            System.out.println("未获取到分类信息,将不被归档 " + book);
        }
    }
    return books;
}
 
Example 11
Source File: BookClass.java    From nju-lib-downloader with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Converts the list items of a result page into {@code Book} objects.
 *
 * <p>For each item the title and id come from {@code p[class=name]}, the
 * category chain from the links inside {@code p[class=info]} (linked onto a
 * fresh {@code RootBookClass}), and author / publish date / subject /
 * category text from the item's flattened text. Items without both a name
 * and an id are reported on standard output and skipped.
 *
 * @param booksliNode the list-item elements, one per book
 * @return the books that could be assembled
 */
private Set<Book> queryBooks(Elements booksliNode) {
    // Loop-invariant patterns, compiled once up front.
    final Pattern fullInfo = Pattern.compile("\\d+\\. (.*) 作者[::](.*) 出版日期[::](\\d+).*?(?:主题词[::](.+))? 分类[::](.*)");
    final Pattern bracketedTitle = Pattern.compile(".*(《.*》).*");

    final Set<Book> result = new HashSet<>();
    for (Element li : booksliNode) {
        String name, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;

        // Title and id
        Elements titleParagraph = li.select("p[class=name]");
        name = titleParagraph.text();
        Elements idAnchors = titleParagraph.select("a[onclick]");
        if (idAnchors.size() > 0) {
            String handler = idAnchors.get(0).attr("onclick");
            // The id is the text between the first "(" and the last ",".
            int from = handler.indexOf("(") + 1;
            int to = handler.lastIndexOf(",");
            if (from != 0 && to != -1) {
                id = handler.substring(from, to);
            }
        }

        // Category chain
        BookClass[] bookClasses = new BookClass[0];
        Elements anchors = li.select("p[class=info]").select("a");
        if (anchors.size() > 0) {
            // Last anchor is treated as the terminal category; the rest are intermediate.
            Element lastAnchor = anchors.last();
            anchors.remove(lastAnchor);
            List<BookClass> path = anchors.stream()
                    .map(anchor -> getBookCata(anchor, false))
                    .filter(Objects::nonNull)
                    .collect(Collectors.toList());
            BookClass leaf = getBookCata(lastAnchor, true);
            if (leaf != null) {
                path.add(leaf);
            }
            bookClasses = path.toArray(bookClasses);
        }
        BookClass bookBookClass = new RootBookClass().link(bookClasses);

        // Author, publish date, subject keywords and category text
        String info = li.text();
        Matcher fullInfoMatcher = fullInfo.matcher(info);
        while (fullInfoMatcher.find()) {
            name = fullInfoMatcher.group(1);
            author = fullInfoMatcher.group(2);
            publishDate = fullInfoMatcher.group(3);
            theme = fullInfoMatcher.group(4);
            detailBookClass = fullInfoMatcher.group(5);
        }
        // A title enclosed in 《》 overrides the one captured above.
        Matcher bracketedTitleMatcher = bracketedTitle.matcher(info);
        while (bracketedTitleMatcher.find()) {
            name = bracketedTitleMatcher.group(1);
        }

        // Assemble the book
        if (name == null || id == null) {
            System.out.println("error: " + info);
            continue;
        }
        Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
        book.setCookie(cookie);
        result.add(book);
        if (bookBookClass.isTerminal()) {
            ((TerminalBookClass) bookBookClass).addBook(book);
        } else {
            System.out.println("未获取到分类信息,将不被归档 " + book);
        }
    }
    return result;
}
 
Example 12
Source File: Book.java    From nju-lib-downloader with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Extracts books from a listing page.
 *
 * <p>Every {@code div[class=boxListLi5]} that contains a link with both
 * {@code href} and {@code title} yields one book: the title attribute is the
 * name, the part of the href after {@code /book/detail/} (or the whole href
 * when that marker is absent) is the id, the first {@code span} is split on
 * "/" into author / press / publish date when it has exactly three parts, and
 * the {@code p} text is the introduction.
 *
 * @param html listing page HTML
 * @return the books found, in document order
 */
public static List<Book> getBookFromHTML(String html) {
    final String detailMarker = "/book/detail/";
    Elements entries = Jsoup.parse(html).select("div[class=boxListLi5]");
    List<Book> books = new ArrayList<>(30);
    for (Element entry : entries) {
        Elements titledLinks = entry.select("a[href][title]");
        if (titledLinks.isEmpty()) {
            // Without the link there is no id, so no book is produced for this entry.
            continue;
        }

        String author = null, publishDate = null, press = null, introduction = null, coverUrl = null;

        Elements coverImages = entry.select("img[src]");
        if (!coverImages.isEmpty()) {
            coverUrl = coverImages.attr("src");
        }

        Element titledLink = titledLinks.get(0);
        String name = titledLink.attr("title");
        String id = titledLink.attr("href");
        int markerAt = id.indexOf(detailMarker);
        if (markerAt != -1) {
            id = id.substring(markerAt + detailMarker.length());
        }

        Elements spans = entry.select("span");
        if (!spans.isEmpty()) {
            String[] pressParts = spans.get(0).text().split("/");
            if (pressParts.length == 3) {
                author = pressParts[0].trim();
                press = pressParts[1].trim();
                publishDate = pressParts[2].trim();
            }
        }

        Elements paragraphs = entry.select("p");
        if (!paragraphs.isEmpty()) {
            introduction = paragraphs.text();
        }

        books.add(new Book(id, name, press, author, publishDate, introduction, coverUrl));
    }
    return books;
}
 
Example 13
Source File: JSoupUtils.java    From ZhiHu-TopAnswer with Apache License 2.0 4 votes vote down vote up
/**
 * Extracts the answer list from the home page.
 *
 * <p>Each {@code div.content} block contributes one entry made of the
 * question link (title and absolute URL), the vote count, the summary text
 * and, when the summary's first child is an {@code img}, its {@code src}.
 * Entries lacking a title or URL are dropped.
 *
 * @param document the parsed home page
 * @return the entries that have both a title and a URL, in document order
 */
public static List<TopicAnswers> getTopicList(Document document) {
    List<TopicAnswers> list = new ArrayList<>();
    for (Element body : document.select("div.content")) {
        TopicAnswers answers = new TopicAnswers();

        // first() returns null when nothing matched.
        Element questionLink = body.select("a.question_link").first();
        if (questionLink != null) {
            answers.setTitle(questionLink.text());
            answers.setUrl("https://www.zhihu.com" + questionLink.attr("href"));
        }

        Element voteLink = body.select("a.zm-item-vote-count.js-expand.js-vote-count").first();
        if (voteLink != null) {
            answers.setVote(voteLink.text());
        }

        Elements summaries = body.select("div.zh-summary.summary.clearfix");

        String descBody = summaries.text();
        // Drop the last 4 characters — presumably a trailing "expand" label; confirm against the page.
        if (descBody.length() > 4) {
            descBody = descBody.substring(0, descBody.length() - 4);
        }
        answers.setBody(descBody);

        Element firstSummary = summaries.first();
        if (firstSummary != null) {
            // A summary without child elements yields null here; guard against it
            // instead of dereferencing and aborting the whole list with an NPE.
            Element firstChild = firstSummary.children().first();
            if (firstChild != null && firstChild.tagName().equals("img")) {
                answers.setImg(firstChild.attr("src"));
            }
        }

        if (!TextUtils.isEmpty(answers.getTitle()) && !TextUtils.isEmpty(answers.getUrl())) {
            L.i(TAG, answers.toString());
            list.add(answers);
        }
    }
    return list;
}