Java Code Examples for org.jsoup.nodes.TextNode#getWholeText()

The following examples show how to use org.jsoup.nodes.TextNode#getWholeText(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Cleaner.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Handles one node of the source tree and mirrors it into {@code destination} when permitted.
 *
 * <ul>
 *   <li>Elements whose tag is accepted by the whitelist are copied through
 *       {@code createSafeElement}; the copy is appended and becomes the new {@code destination}.
 *       Attributes dropped while copying are added to {@code numDiscarded}.</li>
 *   <li>Elements with a rejected tag are counted as discarded, except for {@code root}.</li>
 *   <li>Text nodes are always copied using their whole (un-normalised) text.</li>
 *   <li>Data nodes are copied only when the parent's node name is a whitelisted tag.</li>
 *   <li>Every other node is counted as discarded.</li>
 * </ul>
 *
 * @param source the node being visited
 * @param depth  depth of the node; not used by this method
 */
public void head(Node source, int depth) {
    if (source instanceof Element) {
        Element element = (Element) source;

        if (!whitelist.isSafeTag(element.tagName())) {
            // Rejected tag: nothing is copied. The root itself is never counted as discarded.
            if (source != root) {
                numDiscarded++;
            }
            return;
        }

        // Accepted tag: clone it with only its safe attributes and descend into the clone.
        ElementMeta safeCopy = createSafeElement(element);
        Element copiedElement = safeCopy.el;
        destination.appendChild(copiedElement);

        numDiscarded += safeCopy.numAttribsDiscarded;
        destination = copiedElement;
        return;
    }

    if (source instanceof TextNode) {
        String wholeText = ((TextNode) source).getWholeText();
        destination.appendChild(new TextNode(wholeText));
        return;
    }

    if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) {
        String wholeData = ((DataNode) source).getWholeData();
        destination.appendChild(new DataNode(wholeData));
        return;
    }

    // Comments, XML processing instructions and similar nodes are dropped.
    numDiscarded++;
}
 
Example 2
Source File: Cleaner.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Copies a visited source node into the current {@code destination} if the whitelist allows it.
 *
 * <p>A whitelisted element is cloned via {@code createSafeElement}, appended, and made the new
 * {@code destination}; the number of attributes discarded by the clone is added to
 * {@code numDiscarded}. A non-whitelisted element adds one to {@code numDiscarded} unless it is
 * {@code root}. Text nodes are copied with their whole text. Data nodes are copied when the
 * parent's node name is a whitelisted tag. Anything else adds one to {@code numDiscarded}.
 *
 * @param source the node being visited
 * @param depth  depth of the node; unused here
 */
public void head(Node source, int depth) {
    if (source instanceof Element) {
        final Element original = (Element) source;
        final boolean tagAllowed = whitelist.isSafeTag(original.tagName());

        if (tagAllowed) {
            // Clone with safe attributes only, then continue building beneath the clone.
            final ElementMeta cloned = createSafeElement(original);
            final Element clonedElement = cloned.el;
            destination.appendChild(clonedElement);

            numDiscarded += cloned.numAttribsDiscarded;
            destination = clonedElement;
        } else if (source != root) {
            // Unsafe tag is skipped; the root is not counted against the discard total.
            numDiscarded++;
        }
        return;
    }

    if (source instanceof TextNode) {
        final TextNode original = (TextNode) source;
        final TextNode copy = new TextNode(original.getWholeText());
        destination.appendChild(copy);
        return;
    }

    if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) {
        final DataNode original = (DataNode) source;
        final DataNode copy = new DataNode(original.getWholeData());
        destination.appendChild(copy);
        return;
    }

    // Remaining node kinds (comments, XML processing instructions, ...) are not kept.
    numDiscarded++;
}
 
Example 3
Source File: HtmlHelper.java    From FairEmail with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Limits the amount of text in a document to a maximum size.
 *
 * <p>All elements are visited in the order returned by {@code d.select("*")}, and the lengths of
 * their direct text nodes are accumulated. The text node that crosses the limit is cut at the
 * limit and suffixed with {@code " ..."}; later text nodes of that same element are emptied.
 * Any element visited once the limit has been reached, and in which no cut happened, is removed
 * from the document.
 *
 * @param d        the document to truncate in place
 * @param reformat selects {@code MAX_FORMAT_TEXT_SIZE} when true, otherwise
 *                 {@code MAX_FULL_TEXT_SIZE}, as the limit
 * @return {@code true} if the accumulated text length reached the limit
 */
static boolean truncate(Document d, boolean reformat) {
    final int limit = reformat ? MAX_FORMAT_TEXT_SIZE : MAX_FULL_TEXT_SIZE;

    int total = 0;
    int imageCount = 0;
    for (Element element : d.select("*")) {
        if ("img".equals(element.tagName())) {
            imageCount++;
        }

        // Set when the limit is crossed inside this element; such an element is kept.
        boolean cutInThisElement = false;
        for (Node node : element.childNodes()) {
            if (!(node instanceof TextNode)) {
                continue;
            }

            TextNode textNode = (TextNode) node;
            String content = textNode.getWholeText();

            if (total >= limit) {
                // Already over the limit: blank out siblings following the cut text node.
                if (cutInThisElement) {
                    textNode.text("");
                }
            } else if (total + content.length() >= limit) {
                // This node crosses the limit: keep only the part that still fits.
                content = content.substring(0, limit - total) + " ...";
                textNode.text(content);
                cutInThisElement = true;
            }

            // Note: for blanked nodes this still adds the node's original length.
            total += content.length();
        }

        if (!cutInThisElement && total >= limit) {
            element.remove();
        }
    }

    Log.i("Message size=" + total + " images=" + imageCount);

    return total >= limit;
}
 
Example 4
Source File: DocumentToJCasConverter.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the text carried by a node.
 *
 * @param node the node to inspect
 * @return the whole (un-normalised) text if the node is a {@link TextNode}, otherwise
 *     {@code null}
 */
private String mapToText(final Node node) {
  if (!(node instanceof TextNode)) {
    return null;
  }
  return ((TextNode) node).getWholeText();
}
 
Example 5
Source File: JsoupHtmlTextExtractor.java    From james-project with Apache License 2.0 5 votes vote down vote up
/**
 * Produces the plain-text fragment contributed by a single HTML node.
 *
 * <p>Text nodes yield their whole text. For elements the checks run in this order: the
 * {@code BR_TAG} yields a newline; anything accepted by {@code isList} is delegated to
 * {@code convertListElement}; {@code OL_TAG} and {@code P_TAG} yield a blank line;
 * {@code LI_TAG} yields a newline, indentation by the list nesting level and {@code "- "};
 * {@code IMG_TAG} is delegated to {@code generateImageAlternativeText}. Everything else
 * yields the empty string.
 *
 * @param htmlNode wrapper holding the underlying node and its list nesting level
 * @return the text for this node, never {@code null} for the branches handled here
 */
private String convertNodeToText(HTMLNode htmlNode) {
    Node underlying = htmlNode.underlyingNode;

    if (underlying instanceof TextNode) {
        return ((TextNode) underlying).getWholeText();
    }
    if (!(underlying instanceof Element)) {
        return "";
    }

    Element element = (Element) underlying;
    String tag = element.tagName();

    if (tag.equals(BR_TAG)) {
        return "\n";
    }
    if (isList(element)) {
        return convertListElement(htmlNode.listNestedLevel);
    }
    if (tag.equals(OL_TAG)) {
        return "\n\n";
    }
    if (tag.equals(LI_TAG)) {
        String indent = StringUtils.repeat(" ", htmlNode.listNestedLevel);
        return "\n" + indent + "- ";
    }
    if (tag.equals(P_TAG)) {
        return "\n\n";
    }
    if (tag.equals(IMG_TAG)) {
        return generateImageAlternativeText(element);
    }
    return "";
}
 
Example 6
Source File: TextExtractor.java    From storm-crawler with Apache License 2.0 5 votes vote down vote up
/**
 * Appends the text of a text node to the accumulator.
 *
 * <p>The text is appended verbatim when {@code preserveWhitespace} accepts the node's parent or
 * when the node is a {@code CDataNode}; otherwise it is appended through
 * {@code StringUtil.appendNormalisedWhitespace}.
 *
 * @param accum    buffer receiving the text
 * @param textNode node whose whole text is appended
 */
private static void appendNormalisedText(StringBuilder accum,
        TextNode textNode) {
    final String wholeText = textNode.getWholeText();
    final boolean keepVerbatim = preserveWhitespace(textNode.parent())
            || textNode instanceof CDataNode;

    if (keepVerbatim) {
        accum.append(wholeText);
        return;
    }

    final boolean endsWithWhitespace = lastCharIsWhitespace(accum);
    StringUtil.appendNormalisedWhitespace(accum, wholeText, endsWithWhitespace);
}
 
Example 7
Source File: HiveJobFetchSpout.java    From eagle with Apache License 2.0 4 votes vote down vote up
/**
 * Fetches the job-history configuration page of each given job and emits its key/value pairs
 * when the configuration contains the Hive query string.
 *
 * <p>For every job, the rows of {@code table[id=conf]} are read as key (first cell) and value
 * (second cell). For the {@code Constants.HIVE_QUERY_STRING} key the value is taken from the
 * whole, un-normalised text of the cell's direct text nodes (the last one wins) and stripped;
 * for every other key the cell's normalised text is used.
 *
 * @param appInfo application info; its user is attached to each emitted tuple
 * @param mrJobs  jobs whose configuration pages are fetched
 * @return {@code true} if all jobs were processed; {@code false} as soon as fetching or parsing
 *     one of them throws
 */
private boolean fetchFinishedConfig(AppInfo appInfo, List<MRJob> mrJobs) {
    for (MRJob mrJob : mrJobs) {
        String urlString = crawlConfig.endPointConfig.HSBasePath + "jobhistory/conf/" + mrJob.getId() + "?" + Constants.ANONYMOUS_PARAMETER;
        // Scoped per job so the finally block never re-closes a stream from an earlier iteration.
        InputStream is = null;
        try {
            LOG.info("fetch job conf from {}", urlString);
            is = InputStreamUtils.getInputStream(urlString, null, Constants.CompressionType.NONE);
            final org.jsoup.nodes.Document doc = Jsoup.parse(is, "UTF-8", urlString);
            // Pretty printing is disabled so the query text is not re-indented by jsoup.
            doc.outputSettings().prettyPrint(false);
            org.jsoup.select.Elements elements = doc.select("table[id=conf]").select("tbody").select("tr");
            Map<String, String> hiveQueryLog = new HashMap<>();
            for (org.jsoup.nodes.Element element : elements) {
                org.jsoup.select.Elements tds = element.children();
                String key = tds.get(0).text();
                String value = "";
                org.jsoup.nodes.Element valueElement = tds.get(1);
                if (Constants.HIVE_QUERY_STRING.equals(key)) {
                    // Use the whole text so whitespace and line breaks inside the query survive.
                    for (org.jsoup.nodes.Node child : valueElement.childNodes()) {
                        if (child instanceof TextNode) {
                            TextNode valueTextNode = (TextNode) child;
                            value = StringUtils.strip(valueTextNode.getWholeText());
                        }
                    }
                } else {
                    value = valueElement.text();
                }
                hiveQueryLog.put(key, value);
            }
            if (hiveQueryLog.containsKey(Constants.HIVE_QUERY_STRING)) {
                collector.emit(new ValuesArray(appInfo.getUser(), mrJob.getId(), Constants.ResourceType.JOB_CONFIGURATION, hiveQueryLog), mrJob.getId());
            }
        } catch (Exception e) {
            // The exception is passed as the trailing argument so the logger records the stack
            // trace itself; no separate printStackTrace() to stderr is needed.
            LOG.warn("fetch job conf from {} failed", urlString, e);
            return false;
        } finally {
            Utils.closeInputStream(is);
        }
    }
    return true;
}