Java Code Examples for org.jsoup.nodes.Element#textNodes()

The following examples show how to use org.jsoup.nodes.Element#textNodes(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: PageLoaderEpub.java    From a with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Builds the plain-text content of a chapter from the EPUB resource its URL points to.
 *
 * <p>The resource bytes are decoded with {@code mCharset}, parsed as HTML, and the direct
 * text nodes of every element are collected in document order. When the parsed document
 * has more than one element, each non-empty piece of text becomes its own line, prefixed
 * with two ideographic spaces (U+3000) and separated by CRLF; otherwise the text is
 * appended as-is.
 *
 * @param chapter chapter whose {@code getDurChapterUrl()} is used as the resource href
 * @return the assembled chapter text
 * @throws Exception if loading or decoding the resource fails
 */
@Override
protected String getChapterContent(BookChapterBean chapter) throws Exception {
    Resource chapterResource = epubBook.getResources().getByHref(chapter.getDurChapterUrl());
    StringBuilder result = new StringBuilder();
    Document parsed = Jsoup.parse(new String(chapterResource.getData(), mCharset));
    Elements allElements = parsed.getAllElements();
    for (Element current : allElements) {
        for (TextNode node : current.textNodes()) {
            String line = StringUtils.formatHtml(node.text().trim());
            if (allElements.size() <= 1) {
                // Single-element document: keep the text without any paragraph formatting.
                result.append(line);
                continue;
            }
            if (line.length() == 0) {
                // Blank text nodes produce no paragraph.
                continue;
            }
            if (result.length() > 0) {
                result.append("\r\n");
            }
            result.append("\u3000\u3000").append(line);
        }
    }
    return result.toString();
}
 
Example 2
Source File: PageLoaderEpub.java    From MyBookshelf with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Reads the EPUB resource referenced by the chapter and flattens its HTML into text.
 *
 * <p>Every element of the parsed document is visited and the text of its direct text
 * nodes is trimmed and passed through {@code StringUtils.formatHtml}. If the document
 * contains more than one element, non-empty fragments are emitted as CRLF-separated
 * lines, each starting with two U+3000 (ideographic space) characters; if not, the
 * fragments are simply concatenated.
 *
 * @param chapter the chapter to load; its {@code getDurChapterUrl()} selects the resource
 * @return the chapter's text content
 * @throws Exception propagated from resource loading or charset decoding
 */
@Override
protected String getChapterContent(BookChapterBean chapter) throws Exception {
    Resource res = epubBook.getResources().getByHref(chapter.getDurChapterUrl());
    StringBuilder out = new StringBuilder();
    String html = new String(res.getData(), mCharset);
    Elements nodes = Jsoup.parse(html).getAllElements();
    // The element list is not modified while iterating, so this can be decided once.
    final boolean asParagraphs = nodes.size() > 1;
    for (Element node : nodes) {
        List<TextNode> pieces = node.textNodes();
        for (TextNode piece : pieces) {
            String fragment = StringUtils.formatHtml(piece.text().trim());
            if (asParagraphs) {
                if (fragment.length() > 0) {
                    if (out.length() > 0) {
                        out.append("\r\n");
                    }
                    out.append("\u3000\u3000").append(fragment);
                }
            } else {
                out.append(fragment);
            }
        }
    }
    return out.toString();
}
 
Example 3
Source File: CollectionsPresenter.java    From OpenHub with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Extracts collection entries from a parsed page.
 *
 * <p>For every element matched by the class string below, the id is the last path segment
 * of the title link's {@code href}, the title is the link's first direct text node, and the
 * description is the last direct text node of the entry's last {@code div}.
 *
 * <p>Entries without a title link are skipped. An entry whose description element is
 * missing or has no direct text yields an empty description instead of throwing.
 *
 * @param doc parsed page to scan
 * @return the collections found, in document order; never {@code null}
 */
private ArrayList<Collection> getBellowCollections(Document doc){
    ArrayList<Collection> collections = new ArrayList<>();
    Elements elements = doc.getElementsByClass(
            "d-flex border-bottom border-gray-light pb-4 mb-5");
    for (Element element : elements) {
        Element titleElement = element.select("div > h2 > a").first();
        if (titleElement == null) {
            // first() returns null when nothing matched; without the link there is
            // neither an id nor a title to build an entry from.
            continue;
        }
        String id = titleElement.attr("href");
        id = id.substring(id.lastIndexOf("/") + 1);

        // Fall back to the combined text when the link has no direct text node
        // (e.g. the text is wrapped in a child element).
        List<TextNode> titleTextNodes = titleElement.textNodes();
        String title = titleTextNodes.isEmpty()
                ? titleElement.text()
                : titleTextNodes.get(0).toString();

        // The original computed index 0 for an empty list and then called get(0),
        // which still throws; an empty description is used instead.
        String desc = "";
        Element descElement = element.select("div").last();
        if (descElement != null) {
            List<TextNode> descTextNodes = descElement.textNodes();
            if (!descTextNodes.isEmpty()) {
                desc = descTextNodes.get(descTextNodes.size() - 1).toString().trim();
            }
        }

        collections.add(new Collection(id, title, desc));
    }
    return collections;
}
 
Example 4
Source File: RepositoriesPresenter.java    From OpenHub with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Builds a {@code Repository} from one repository entry of a collections page.
 *
 * <p>The full name comes from the {@code href} of the {@code div > h1 > a} link (leading
 * character dropped), the description from the direct text nodes of the second-to-last
 * {@code div}, and the star/fork counts and language from the last {@code div}.
 *
 * @param element the entry element to parse
 * @return the populated repository; its owner has an empty avatar URL because the avatar
 *         is not read from the markup here
 * @throws Exception if the markup does not have the expected shape (missing links,
 *         missing text nodes, non-numeric counts, ...)
 */
private Repository parseCollectionsRepositoryData(Element element) throws Exception{
    String fullName = element.select("div > h1 > a").attr("href");
    fullName = fullName.substring(1);
    String owner = fullName.substring(0, fullName.lastIndexOf("/"));
    String repoName = fullName.substring(fullName.lastIndexOf("/") + 1);
    String ownerAvatar = "";

    Elements articleElements = element.getElementsByTag("div");
    Element descElement = articleElements.get(articleElements.size() - 2);
    StringBuilder desc = new StringBuilder();
    for (TextNode textNode : descElement.textNodes()) {
        desc.append(textNode.getWholeText());
    }

    Element numElement = articleElements.last();
    Elements countLinks = numElement.select("a");
    // Strip spaces and thousands separators so values such as "1,234" parse.
    String starNumStr = countLinks.get(0).textNodes().get(1).toString()
            .replaceAll(" ", "").replaceAll(",", "");
    String forkNumStr = countLinks.get(1).textNodes().get(1).toString()
            .replaceAll(" ", "").replaceAll(",", "");

    String language = "";
    Elements languageElements = numElement.select("span > span > span");
    // Index 1 is read below, so at least two matches are required.
    if (languageElements.size() > 1) {
        language = languageElements.get(1).textNodes().get(0).toString();
    }

    Repository repo = new Repository();
    repo.setFullName(fullName);
    repo.setName(repoName);
    User user = new User();
    user.setLogin(owner);
    user.setAvatarUrl(ownerAvatar);
    repo.setOwner(user);

    repo.setDescription(desc.toString());
    repo.setStargazersCount(Integer.parseInt(starNumStr));
    repo.setForksCount(Integer.parseInt(forkNumStr));
    repo.setLanguage(language);

    return repo;
}
 
Example 5
Source File: ESchoolParser.java    From substitution-schedule-parser with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Returns the trimmed text of the cell's last direct text node when the cell has exactly
 * one or two direct text nodes.
 *
 * @param cell the cell to inspect
 * @return the trimmed text, or {@code null} when the cell has zero or more than two
 *         direct text nodes
 */
private String getNewValue(Element cell) {
    List<TextNode> parts = cell.textNodes();
    int count = parts.size();
    if (count != 1 && count != 2) {
        return null;
    }
    // With one node this is index 0, with two nodes index 1.
    return parts.get(count - 1).text().trim();
}
 
Example 6
Source File: ESchoolParser.java    From substitution-schedule-parser with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Returns the trimmed text of the cell's first direct text node, but only when the cell
 * has exactly two direct text nodes.
 *
 * @param cell the cell to inspect
 * @return the trimmed text of the first node, or {@code null} for any other node count
 */
private String getPreviousValue(Element cell) {
    List<TextNode> parts = cell.textNodes();
    if (parts.size() != 2) {
        return null;
    }
    return parts.get(0).text().trim();
}
 
Example 7
Source File: ParagraphMarkedClassification.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Looks for a paragraph marking in the paragraph's text and, if one is found, annotates
 * the paragraph with it and strips the bracketed marking from the paragraph's direct
 * text nodes.
 *
 * @param p the paragraph element to inspect and update in place
 */
private void processParagraph(Element p) {
  String paragraphText = p.text();
  Matcher markingMatcher = PARAGRAPH_MARKING.matcher(paragraphText);
  if (!markingMatcher.find()) {
    return;
  }
  String found = markingMatcher.group(CLASSFICATION_GROUP);

  MarkupUtils.additionallyAnnotateAsType(
      p, "uk.gov.dstl.baleen.types.metadata.ProtectiveMarking");
  // TODO: This simply overrides any existing value; we could instead select the best
  // classification (or output everything and let a later cleaner decide).
  MarkupUtils.setAttribute(p, "classification", found.trim());

  // TODO: Ideally the classification text would be deleted from the front of the paragraph.
  // That needs a utility that consumes the children of p until the end of the marking,
  // splitting text nodes across multiple children, which is fairly complex.
  // As an interim, the bracketed marking is removed from each direct text node of p
  // that contains it.
  String bracketed = "(" + found + ")";
  for (org.jsoup.nodes.TextNode node : p.textNodes()) {
    String current = node.text();
    if (current.contains(bracketed)) {
      node.text(current.replace(bracketed, ""));
    }
  }
}
 
Example 8
Source File: BenghuaiNovel.java    From novel with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the article text from an HTML response.
 *
 * <p>The first element matching {@code div#content} is located and the whole text of each
 * of its direct text nodes is appended to the result, each followed by CRLF.
 *
 * @param response the HTML to parse
 * @return the concatenated text; NOTE(review): a page without {@code div#content} leads
 *         to a {@code NullPointerException} here - confirm callers expect that
 */
@Override
public String parseArticle(String response) {
    Element articleBody = Jsoup.parse(response).select("div#content").first();
    StringBuilder article = new StringBuilder();
    articleBody.textNodes().forEach(
            node -> article.append(node.getWholeText()).append("\r\n"));
    return article.toString();
}
 
Example 9
Source File: RepositoriesPresenter.java    From OpenHub with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Builds a {@code Repository} from one repository entry of a trending page.
 *
 * <p>The full name comes from the {@code href} of the {@code h1 > a} link (leading
 * character dropped). The description is the concatenated direct text of the description
 * element, or empty when that element is absent. Language, star count, fork count and
 * the star count for the period are read from the stats element; spaces and thousands
 * separators are stripped before the counts are parsed.
 *
 * @param element the entry element to parse
 * @return the populated repository, with {@code since} taken from the enclosing instance
 * @throws Exception if the markup does not have the expected shape (missing links,
 *         missing text nodes, non-numeric counts, ...)
 */
private Repository parseTrendingRepositoryData(Element element) throws Exception{
    String fullName = element.select("h1 > a").attr("href");
    fullName = fullName.substring(1);
    String owner = fullName.substring(0, fullName.lastIndexOf("/"));
    String repoName = fullName.substring(fullName.lastIndexOf("/") + 1);

    // first() returns null when no description element is present; treat that as an
    // empty description instead of failing the whole entry.
    Element descElement = element.getElementsByClass("col-9 text-gray my-1 pr-4").first();
    StringBuilder desc = new StringBuilder();
    if (descElement != null) {
        for (TextNode textNode : descElement.textNodes()) {
            desc.append(textNode.getWholeText());
        }
    }

    Element numElement = element.getElementsByClass("f6 text-gray mt-2").first();
    String language = "";
    Elements languageElements = numElement.select("span > span");
    // Index 1 is read below, so at least two matches are required.
    if (languageElements.size() > 1) {
        language = languageElements.get(1).textNodes().get(0).toString().trim();
    }

    Elements countLinks = numElement.select("a");
    String starNumStr = countLinks.get(0).textNodes().get(1).toString()
            .replaceAll(" ", "").replaceAll(",", "");
    String forkNumStr = countLinks.get(1).textNodes().get(1).toString()
            .replaceAll(" ", "").replaceAll(",", "");

    Element periodElement = numElement.getElementsByClass("d-inline-block float-sm-right").first();
    String periodNumStr = "0";
    if (periodElement != null) {
        periodNumStr = periodElement.childNodes().get(2).toString().trim();
        // Keep only the leading number; if there is no trailing text, use the whole value
        // rather than calling substring with -1.
        int firstSpace = periodNumStr.indexOf(" ");
        if (firstSpace >= 0) {
            periodNumStr = periodNumStr.substring(0, firstSpace);
        }
        periodNumStr = periodNumStr.replaceAll(",", "");
    }

    Repository repo = new Repository();
    repo.setFullName(fullName);
    repo.setName(repoName);
    User user = new User();
    user.setLogin(owner);
    repo.setOwner(user);

    repo.setDescription(desc.toString().trim()
            .replaceAll("\n", ""));
    repo.setStargazersCount(Integer.parseInt(starNumStr));
    repo.setForksCount(Integer.parseInt(forkNumStr));
    repo.setSinceStargazersCount(Integer.parseInt(periodNumStr));
    repo.setLanguage(language);
    repo.setSince(since);

    return repo;
}