Java Code Examples for org.jsoup.nodes.TextNode#getWholeText()

The following examples show how to use org.jsoup.nodes.TextNode#getWholeText(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You can also explore related API usage in the sidebar.
Example 1
Source Project: astor   File: Cleaner.java    License: GNU General Public License v2.0 6 votes vote down vote up
/**
 * Visits one node of the source tree and mirrors it into the destination tree when permitted.
 *
 * <p>An element whose tag passes {@code whitelist.isSafeTag} is re-created through
 * {@code createSafeElement}, appended to the current {@code destination}, and then becomes the
 * new {@code destination}. Text nodes are copied as new text nodes. Data nodes are copied only
 * when their parent's node name is a safe tag. Anything else increments {@code numDiscarded},
 * except that an unsafe {@code root} element is not counted.
 *
 * @param source the node currently being visited
 * @param depth  the depth of the visited node; not used by this method
 */
public void head(Node source, int depth) {
    if (source instanceof Element) {
        final Element element = (Element) source;

        if (!whitelist.isSafeTag(element.tagName())) {
            // Unsafe tag: nothing is added. The root itself is not counted as discarded.
            if (source != root) {
                numDiscarded++;
            }
            return;
        }

        // Safe tag: attach the cleaned copy and make it the new insertion point.
        final ElementMeta safe = createSafeElement(element);
        final Element copy = safe.el;
        destination.appendChild(copy);
        numDiscarded += safe.numAttribsDiscarded;
        destination = copy;
        return;
    }

    if (source instanceof TextNode) {
        final String wholeText = ((TextNode) source).getWholeText();
        destination.appendChild(new TextNode(wholeText));
        return;
    }

    if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) {
        final String wholeData = ((DataNode) source).getWholeData();
        destination.appendChild(new DataNode(wholeData));
        return;
    }

    // Comments, XML processing instructions, etc. are dropped and counted.
    numDiscarded++;
}
 
Example 2
Source Project: astor   File: Cleaner.java    License: GNU General Public License v2.0 6 votes vote down vote up
/**
 * Handles a single node of the tree being cleaned, copying it to the destination if allowed.
 *
 * <p>Safe elements (per {@code whitelist.isSafeTag}) are rebuilt with
 * {@code createSafeElement}, appended under {@code destination}, and {@code destination} is
 * moved to the new copy. Text nodes are duplicated from their whole text. Data nodes are
 * duplicated only when the parent's node name is a safe tag. Every other node bumps
 * {@code numDiscarded}; an unsafe element is counted too unless it is {@code root}.
 *
 * @param source the node being visited
 * @param depth  the depth of the node; this method does not read it
 */
public void head(Node source, int depth) {
    if (source instanceof Element) {
        final Element original = (Element) source;
        final boolean safeTag = whitelist.isSafeTag(original.tagName());

        if (safeTag) {
            // Clone the element with only its safe attributes and descend into the clone.
            final ElementMeta result = createSafeElement(original);
            final Element clone = result.el;
            destination.appendChild(clone);
            numDiscarded += result.numAttribsDiscarded;
            destination = clone;
        } else if (source != root) {
            // Unsafe tags are skipped and counted; the root is skipped without being counted.
            numDiscarded++;
        }
        return;
    }

    if (source instanceof TextNode) {
        final TextNode original = (TextNode) source;
        destination.appendChild(new TextNode(original.getWholeText()));
        return;
    }

    final boolean copyableData =
            source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName());
    if (copyableData) {
        final DataNode original = (DataNode) source;
        destination.appendChild(new DataNode(original.getWholeData()));
        return;
    }

    // Remaining node kinds (comments, XML processing instructions, ...) are discarded.
    numDiscarded++;
}
 
Example 3
/**
 * Limits the amount of text in a document by walking every element and its direct text nodes.
 *
 * <p>Text lengths are accumulated in document selection order. The text node that crosses the
 * limit is cut and suffixed with {@code " ..."}; later text nodes of that same element are
 * emptied. Elements visited once the limit has been reached, and which did not contain the
 * cut, are removed. The final size and the number of {@code img} elements are logged.
 *
 * @param d        the document to truncate in place
 * @param reformat selects {@code MAX_FORMAT_TEXT_SIZE} when true, otherwise
 *                 {@code MAX_FULL_TEXT_SIZE}, as the limit
 * @return {@code true} when the accumulated text length reached the limit
 */
static boolean truncate(Document d, boolean reformat) {
    final int limit;
    if (reformat) {
        limit = MAX_FORMAT_TEXT_SIZE;
    } else {
        limit = MAX_FULL_TEXT_SIZE;
    }

    int total = 0;
    int imageCount = 0;
    for (Element element : d.select("*")) {
        if ("img".equals(element.tagName())) {
            imageCount++;
        }

        // Set when this element contains the text node that was cut at the limit.
        boolean cutHere = false;
        for (Node node : element.childNodes()) {
            if (!(node instanceof TextNode)) {
                continue;
            }

            final TextNode textNode = (TextNode) node;
            final String whole = textNode.getWholeText();
            // Length added to the running total; the original length unless the node is cut.
            int counted = whole.length();

            if (total >= limit) {
                if (cutHere) {
                    textNode.text("");
                }
            } else if (total + whole.length() >= limit) {
                final String shortened = whole.substring(0, limit - total) + " ...";
                textNode.text(shortened);
                counted = shortened.length();
                cutHere = true;
            }

            total += counted;
        }

        if (!cutHere && total >= limit) {
            element.remove();
        }
    }

    Log.i("Message size=" + total + " images=" + imageCount);

    return total >= limit;
}
 
Example 4
Source Project: baleen   File: DocumentToJCasConverter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the text of a node, provided it is a text node.
 *
 * @param node the node to inspect
 * @return the node's whole text when {@code node} is a {@link TextNode}; {@code null} otherwise
 */
private String mapToText(final Node node) {
  return node instanceof TextNode ? ((TextNode) node).getWholeText() : null;
}
 
Example 5
/**
 * Produces the plain-text fragment that represents one HTML node.
 *
 * <p>Text nodes yield their whole text. Elements are mapped by tag: a line break for
 * {@code BR_TAG}, the result of {@code convertListElement} for list elements, a blank line
 * for {@code OL_TAG} and {@code P_TAG}, an indented {@code "- "} bullet for {@code LI_TAG},
 * and the generated alternative text for {@code IMG_TAG}. Anything else yields an empty string.
 *
 * @param htmlNode wrapper holding the underlying node and its list nesting level
 * @return the text for the node; never {@code null} from the branches visible here
 */
private String convertNodeToText(HTMLNode htmlNode) {
    final Node underlying = htmlNode.underlyingNode;

    if (underlying instanceof TextNode) {
        return ((TextNode) underlying).getWholeText();
    }
    if (!(underlying instanceof Element)) {
        return "";
    }

    final Element element = (Element) underlying;
    final String tag = element.tagName();

    // The order of these checks mirrors the original precedence between tag kinds.
    if (tag.equals(BR_TAG)) {
        return "\n";
    }
    if (isList(element)) {
        return convertListElement(htmlNode.listNestedLevel);
    }
    if (tag.equals(OL_TAG)) {
        return "\n\n";
    }
    if (tag.equals(LI_TAG)) {
        final String indent = StringUtils.repeat(" ", htmlNode.listNestedLevel);
        return "\n" + indent + "- ";
    }
    if (tag.equals(P_TAG)) {
        return "\n\n";
    }
    if (tag.equals(IMG_TAG)) {
        return generateImageAlternativeText(element);
    }
    return "";
}
 
Example 6
Source Project: storm-crawler   File: TextExtractor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Appends the text of a text node to an accumulator.
 *
 * <p>The whole text is appended unchanged when {@code preserveWhitespace} holds for the
 * node's parent or the node is a {@link CDataNode}; otherwise it is appended through
 * {@code StringUtil.appendNormalisedWhitespace}.
 *
 * @param accum    the builder receiving the text
 * @param textNode the text node whose whole text is appended
 */
private static void appendNormalisedText(StringBuilder accum,
        TextNode textNode) {
    final String wholeText = textNode.getWholeText();
    final boolean verbatim = preserveWhitespace(textNode.parent())
            || textNode instanceof CDataNode;

    if (!verbatim) {
        final boolean endsWithWhitespace = lastCharIsWhitespace(accum);
        StringUtil.appendNormalisedWhitespace(accum, wholeText, endsWithWhitespace);
        return;
    }
    accum.append(wholeText);
}
 
Example 7
Source Project: eagle   File: HiveJobFetchSpout.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Fetches the job-history configuration page of each MR job, parses its {@code conf} table
 * into key/value pairs, and emits the pairs when they contain the Hive query string.
 *
 * <p>For the {@code Constants.HIVE_QUERY_STRING} row the value is taken from the cell's direct
 * text nodes (the last text node wins) and stripped; for every other row the cell's
 * {@code text()} is used.
 *
 * @param appInfo the application whose user is attached to the emitted tuple
 * @param mrJobs  the MR jobs whose configuration pages are fetched
 * @return {@code true} when every job was processed; {@code false} as soon as fetching or
 *         parsing any job fails (remaining jobs are not processed)
 */
private boolean fetchFinishedConfig(AppInfo appInfo, List<MRJob> mrJobs) {
    for (MRJob mrJob : mrJobs) {
        String urlString = crawlConfig.endPointConfig.HSBasePath + "jobhistory/conf/" + mrJob.getId() + "?" + Constants.ANONYMOUS_PARAMETER;
        // Scoped per job so a failed open never re-closes the previous job's stream.
        InputStream is = null;
        try {
            LOG.info("fetch job conf from {}", urlString);
            is = InputStreamUtils.getInputStream(urlString, null, Constants.CompressionType.NONE);
            final org.jsoup.nodes.Document doc = Jsoup.parse(is, "UTF-8", urlString);
            doc.outputSettings().prettyPrint(false);
            org.jsoup.select.Elements rows = doc.select("table[id=conf]").select("tbody").select("tr");
            Map<String, String> hiveQueryLog = new HashMap<>();
            for (org.jsoup.nodes.Element row : rows) {
                org.jsoup.select.Elements tds = row.children();
                String key = tds.get(0).text();
                String value = "";
                org.jsoup.nodes.Element valueElement = tds.get(1);
                if (Constants.HIVE_QUERY_STRING.equals(key)) {
                    // Read the raw text nodes rather than text(), then strip the result.
                    for (org.jsoup.nodes.Node child : valueElement.childNodes()) {
                        if (child instanceof TextNode) {
                            TextNode valueTextNode = (TextNode) child;
                            value = valueTextNode.getWholeText();
                            value = StringUtils.strip(value);
                        }
                    }
                } else {
                    value = valueElement.text();
                }
                hiveQueryLog.put(key, value);
            }
            if (hiveQueryLog.containsKey(Constants.HIVE_QUERY_STRING)) {
                collector.emit(new ValuesArray(appInfo.getUser(), mrJob.getId(), Constants.ResourceType.JOB_CONFIGURATION, hiveQueryLog), mrJob.getId());
            }
        } catch (Exception e) {
            // The exception is passed as the trailing argument with no placeholder of its own,
            // so the logger records it with its stack trace (assumes LOG follows the SLF4J
            // convention). This replaces the former printStackTrace() call.
            LOG.warn("fetch job conf from {} failed", urlString, e);
            return false;
        } finally {
            Utils.closeInputStream(is);
        }
    }
    return true;
}