org.jsoup.nodes.Node Java Examples

The following examples show how to use org.jsoup.nodes.Node. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HtmlToPlainText.java    From intellij-quarkus with Eclipse Public License 2.0 7 votes vote down vote up
/**
 * Emits the plain-text prefix for a node as the traversal enters it.
 *
 * <p>Text nodes contribute their text. A {@code ul} only deepens the list
 * nesting counter. An {@code li} starts a new line, is indented by two spaces
 * per nesting level beyond the first, and gets a {@code "* "} bullet at the
 * first level or {@code "- "} at deeper levels. A {@code dt} gets a two-space
 * indent, and paragraph, heading (h1-h5) and table-row tags start a new line.
 *
 * @param node  the node being entered
 * @param depth the node's depth in the tree (not used here)
 */
@Override
public void head(Node node, int depth) {
    if (node instanceof TextNode) {
        // TextNodes carry all user-readable text in the DOM.
        append(((TextNode) node).text());
        return;
    }

    String tag = node.nodeName();
    if (tag.equals("ul")) {
        listNesting++;
        return;
    }
    if (tag.equals("li")) {
        append("\n ");
        // One indent step for every nesting level past the outermost list.
        for (int remaining = listNesting - 1; remaining > 0; remaining--) {
            append("  ");
        }
        append(listNesting == 1 ? "* " : "- ");
        return;
    }
    if (tag.equals("dt")) {
        append("  ");
        return;
    }
    if (StringUtil.in(tag, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
        append("\n");
    }
}
 
Example #2
Source File: ComMailingContentServiceImpl.java    From openemm with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Recursively flattens a list of DOM nodes into text, appending to {@code sb}.
 *
 * <p>Text nodes are appended with their whole (un-normalized) text. An
 * {@code a} element is rendered through {@code getTextLink}, a {@code br}
 * becomes a newline, and any other element is descended into. Nodes that are
 * neither elements nor text nodes are skipped.
 *
 * @param sb    the buffer receiving the generated text
 * @param nodes the nodes to render, in document order
 */
private void generateTextContent(StringBuilder sb, List<Node> nodes) {
    for (Node current : nodes) {
        if (current instanceof TextNode) {
            sb.append(((TextNode) current).getWholeText());
            continue;
        }
        if (!(current instanceof Element)) {
            continue;
        }

        Element element = (Element) current;
        String tag = element.nodeName();
        if (tag.equals("a")) {
            sb.append(getTextLink(element));
        } else if (tag.equals("br")) {
            sb.append('\n');
        } else {
            generateTextContent(sb, element.childNodes());
        }
    }
}
 
Example #3
Source File: SearchUtils.java    From emotional_analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Gets the name of a song.
 *
 * <p>Requests the mobile song page {@code http://music.163.com/m/song?id=<songId>}
 * and returns the first child node of the first element carrying the CSS class
 * {@code f-ff2}, rendered with {@code toString()}. When no such element exists,
 * or the element has no child nodes, a fixed "not found" message is returned
 * instead.
 *
 * @param songId the id of the song to look up
 * @return the rendered first child of the matching element, or the fixed
 *         not-found message
 * @throws Exception if executing the request or parsing the response fails
 */
public static String getSongNameById(long songId) throws Exception{
	// NOTE(review): the timeout is 2,000,000,000 ms (roughly 23 days), which is
	// effectively "never time out" - confirm this is intended.
	Response execute = Jsoup.connect("http://music.163.com/m/song?id=" + songId)
			.header("User-Agent",
					"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")
			.header("Cache-Control", "no-cache").timeout(2000000000)
			.execute();
	Document parse = execute.parse();
	Elements elementsByClass = parse.getElementsByClass("f-ff2");
	if(!elementsByClass.isEmpty()){
		Element element = elementsByClass.get(0);
		// A matched element without children would make childNode(0) throw an
		// IndexOutOfBoundsException; treat it like "not found" instead.
		if(element.childNodeSize() > 0){
			return element.childNode(0).toString();
		}
	}
	return "ES中歌曲在网易云音乐中找不到";
}
 
Example #4
Source File: SongTest.java    From emotional_analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the singer and the album out of a song page.
 *
 * <p>Fetches a fixed song page, takes the elements with CSS class
 * {@code s-fc7}, and prints the first child node of the second match (singer)
 * and of the third match (album).
 *
 * @throws Exception if the request or parsing fails
 */
@Test
public void test4() throws Exception{
	Document page = Jsoup.connect("http://music.163.com/song?id=63650")
			.ignoreContentType(true)
			.execute()
			.parse();
	Elements candidates = page.getElementsByClass("s-fc7");
	// Singer
	String singer = candidates.get(1).childNode(0).toString();
	// Album
	String album = candidates.get(2).childNode(0).toString();
	System.out.println(singer+"--------"+album);
}
 
Example #5
Source File: DeepTextElementBuilder.java    From Asqatasun with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Builds the text of an element by walking its whole subtree.
 *
 * <p>If the element has the {@code ALT_ATTR} attribute, the text produced by
 * {@code altAttrTextBuilder} comes first. Then each direct child contributes:
 * non-blank text nodes add their trimmed text, and child elements add their
 * own recursively built text. Every contribution is preceded by
 * {@code SPACER}, and the final result is trimmed.
 *
 * @param element the element to extract text from
 * @return the trimmed, concatenated text
 */
@Override
public String buildTextFromElement(Element element) {
    StringBuilder collected = new StringBuilder();
    if (element.hasAttr(ALT_ATTR)) {
        collected.append(SPACER).append(altAttrTextBuilder.buildTextFromElement(element));
    }

    for (Node child : element.childNodes()) {
        if (child instanceof Element) {
            collected.append(SPACER).append(buildTextFromElement((Element) child));
            continue;
        }
        if (!(child instanceof TextNode)) {
            continue;
        }
        TextNode textChild = (TextNode) child;
        if (!textChild.isBlank()) {
            collected.append(SPACER).append(StringUtils.trim(textChild.text()));
        }
    }
    return StringUtils.trim(collected.toString());
}
 
Example #6
Source File: ParseUtil.java    From zrlog with Apache License 2.0 6 votes vote down vote up
/**
 * Builds an HTML digest (summary) of the given HTML fragment.
 *
 * <p>The fragment is parsed and its nodes are collected via
 * {@code getAllTextNode}. For each collected text node, the outer HTML of its
 * parent is appended and the text length is added to a running total; once
 * the total exceeds {@code size}, {@code " ..."} is appended and collection
 * stops. If the input contains a {@code video} element, the first one is
 * placed in front of the digest followed by {@code <br/>}.
 *
 * @param str  the HTML to summarise
 * @param size the text-length threshold after which the digest is cut off
 * @return the trimmed digest HTML
 */
public static String autoDigest(String str, int size) {
    List<Node> textNodes = new ArrayList<>();
    getAllTextNode(Jsoup.parseBodyFragment(str).childNodes(), textNodes);

    StringBuilder summary = new StringBuilder();
    int seenChars = 0;
    for (Node candidate : textNodes) {
        if (!(candidate instanceof TextNode)) {
            continue;
        }
        summary.append(candidate.parent().outerHtml());
        seenChars += ((TextNode) candidate).text().length();
        if (seenChars > size) {
            summary.append(" ...");
            break;
        }
    }

    String digest = summary.toString();
    Elements videos = Jsoup.parse(str).body().select("video");
    boolean hasVideo = videos != null && !videos.isEmpty();
    if (hasVideo) {
        // Lead with the first video so it is shown above the text excerpt.
        digest = videos.get(0).toString() + "<br/>" + digest;
    }
    return digest.trim();
}
 
Example #7
Source File: Paragraph.java    From dkpro-c4corpus with Apache License 2.0 6 votes vote down vote up
/**
 * Computes this paragraph's raw text from the nodes it contains.
 *
 * <p>For every text node: the tag name is set from the node's path, the
 * node's text (line breaks normalized, then trimmed) is appended to the raw
 * text, and, if the node is inside a link, its untrimmed text length is added
 * to {@code charsCountInLinks}. Non-text nodes are ignored.
 */
public void initRawInfo()
{
    StringBuilder collected = new StringBuilder();
    for (Node member : this) {
        if (!(member instanceof TextNode)) {
            continue;
        }
        this.setTagName(getPath(member));

        String memberText = ((TextNode) member).text();
        collected.append(Utils.normalizeBreaks(memberText).trim());

        if (NodeHelper.isLink(member)) {
            charsCountInLinks += memberText.length();
        }
    }

    rawText = collected.toString();
}
 
Example #8
Source File: NodeTraversor.java    From jsoup-learning with MIT License 6 votes vote down vote up
/**
 * Start a depth-first traverse of the root and all of its descendants.
 * <p>
 * The walk is iterative rather than recursive: {@code visitor.head} is called
 * when a node is first reached, and {@code visitor.tail} is called once the
 * node has no (more) children to visit. The traversal ends after the root's
 * own tail call.
 * @param root the root node point to traverse.
 */
public void traverse(Node root) {
    Node node = root;
    int depth = 0;
    
    while (node != null) {
        visitor.head(node, depth);
        if (node.childNodeSize() > 0) {
            // Descend: the first child is visited next, one level deeper.
            node = node.childNode(0);
            depth++;
        } else {
            // Leaf (or childless node): close every ancestor whose subtree is
            // now finished, i.e. climb while there is no sibling to move to.
            // The depth > 0 guard keeps the climb from going above the root.
            while (node.nextSibling() == null && depth > 0) {
                visitor.tail(node, depth);
                node = node.parent();
                depth--;
            }
            // Close the node we stopped at; it is either the root or a node
            // that still has a following sibling.
            visitor.tail(node, depth);
            if (node == root)
                break;
            node = node.nextSibling();
        }
    }
}
 
Example #9
Source File: HtmlParser.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Recursively collects the text of all leaf text nodes into {@code textList}.
 *
 * <p>Nodes with children are descended into. A childless node named
 * {@code #text} contributes its whole text with every match of the
 * {@code newline} pattern removed, unless the result is empty.
 *
 * @param nodeList the nodes to scan
 * @param textList the list receiving the extracted text, in document order
 */
private static void readNodes(List<Node> nodeList, List<String> textList)
{
	for(Node current : nodeList)
	{
		if(current.childNodeSize()>0)
		{
			readNodes(current.childNodes(), textList);
			continue;
		}
		if(!current.nodeName().equals("#text"))
		{
			continue;
		}
		String stripped=newline.matcher(((TextNode) current).getWholeText()).replaceAll("");
		if(!stripped.isEmpty())
		{
			textList.add(stripped);
		}
	}
}
 
Example #10
Source File: ElementOperator.java    From xsoup with MIT License 6 votes vote down vote up
/**
 * Extracts text from the direct text-node children of an element.
 *
 * <p>When {@code group} is 0, the text of all direct text nodes is
 * concatenated. Otherwise the text of the {@code group}-th direct text node
 * (1-based) is returned; if there are fewer text nodes than that, the result
 * is the empty string.
 *
 * @param element the element whose direct text nodes are read
 * @return the selected text
 */
@Override
public String operate(Element element) {
    StringBuilder allText = new StringBuilder();
    int seenTextNodes = 0;
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        TextNode textChild = (TextNode) child;
        if (group == 0) {
            allText.append(textChild.text());
            continue;
        }
        seenTextNodes++;
        if (seenTextNodes == group) {
            return textChild.text();
        }
    }
    return allText.toString();
}
 
Example #11
Source File: HtmlToPlainText.java    From lemminx with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Writes the plain-text lead-in for a node when the traversal reaches it.
 *
 * <p>Text nodes are appended as text. {@code ul} increments the list nesting
 * counter. {@code li} begins a new line, adds two spaces of indent for each
 * nesting level above one, then a {@code "* "} marker for first-level items or
 * {@code "- "} for nested ones. {@code dt} adds a two-space indent, and
 * {@code p}, {@code h1}-{@code h5} and {@code tr} begin a new line.
 *
 * @param node  the node being entered
 * @param depth the node's depth in the tree (not used here)
 */
@Override
public void head(Node node, int depth) {
	if (node instanceof TextNode) {
		// TextNodes carry all user-readable text in the DOM.
		append(((TextNode) node).text());
		return;
	}

	String tagName = node.nodeName();
	if (tagName.equals("ul")) {
		listNesting++;
	} else if (tagName.equals("li")) {
		append("\n ");
		int level = 1;
		while (level < listNesting) {
			append("  ");
			level++;
		}
		String bullet = (listNesting == 1) ? "* " : "- ";
		append(bullet);
	} else if (tagName.equals("dt")) {
		append("  ");
	} else if (StringUtil.in(tagName, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
		append("\n");
	}
}
 
Example #12
Source File: HtmlParser.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Recursively collects leaf text nodes together with the name of their parent tag.
 *
 * <p>Nodes with children are descended into, passing their own node name as
 * the tag. A childless node named {@code #text} is recorded as a
 * (tag, whole text) entry; a tag of {@code body} (any case) is recorded as
 * {@code p} instead.
 *
 * @param nodeList    the nodes to scan
 * @param textListMap the list receiving (tag, text) entries in document order
 * @param tag         the node name of the parent of {@code nodeList}
 */
private static void readNodesWithTags(List<Node> nodeList, List<Map.Entry<String,String>> textListMap, String tag)
{
	for(Node current : nodeList)
	{
		if(current.childNodeSize()>0)
		{
			readNodesWithTags(current.childNodes(), textListMap, current.nodeName());
			continue;
		}
		if(!current.nodeName().equals("#text"))
		{
			continue;
		}
		// Text directly under <body> is reported as a paragraph.
		if(tag.equalsIgnoreCase("body"))
		{
			tag="p";
		}
		String wholeText=((TextNode) current).getWholeText();
		textListMap.add(new AbstractMap.SimpleEntry<String,String>(tag, wholeText));
	}
}
 
Example #13
Source File: ElementOperator.java    From zongtui-webcrawler with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Returns text taken from the element's direct text-node children.
 *
 * <p>With {@code group == 0} the text of every direct text node is joined
 * together. With any other value, the text of the {@code group}-th direct
 * text node (counting from 1) is returned, or the empty string when the
 * element has fewer text nodes.
 *
 * @param element the element to read
 * @return the joined or selected text
 */
@Override
public String operate(Element element) {
    StringBuilder joined = new StringBuilder();
    int position = 0;
    for (Node child : element.childNodes()) {
        if (child instanceof TextNode) {
            TextNode text = (TextNode) child;
            if (group != 0) {
                position += 1;
                if (position == group) {
                    return text.text();
                }
            } else {
                joined.append(text.text());
            }
        }
    }
    return joined.toString();
}
 
Example #14
Source File: OutputFormatter.java    From Xndroid with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Appends the text of an element's subtree to {@code accum}, skipping every
 * child for which {@code unlikely} returns true.
 *
 * <p>Text nodes are appended directly. Before descending into a child
 * element, a single space is added when either the element is a block, the
 * buffer is non-empty and does not already end in whitespace, or (failing
 * that) the element is a {@code br}.
 *
 * @param e      the element whose children are rendered
 * @param accum  the buffer receiving the text
 * @param indent the current recursion depth (only passed on to the recursion)
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }
        if (child instanceof TextNode) {
            accum.append(((TextNode) child).text());
            continue;
        }
        if (!(child instanceof Element)) {
            continue;
        }

        Element nested = (Element) child;
        boolean blockNeedsGap = accum.length() > 0 && nested.isBlock()
                && !lastCharIsWhitespace(accum);
        if (blockNeedsGap || nested.tagName().equals("br")) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}
 
Example #15
Source File: ElementUtil.java    From flow with Apache License 2.0 6 votes vote down vote up
/**
 * Converts the given element and its children to a JSoup node with
 * children.
 *
 * <p>A text-node element becomes a JSoup {@code TextNode}. Any other element
 * becomes a JSoup element with the same tag: its {@code innerHTML} property,
 * if present, is set as the HTML content; each attribute is copied, with an
 * empty-string value written as a boolean attribute; and each child is
 * converted recursively and appended.
 *
 * @param document
 *            A JSoup document
 * @param element
 *            The element to convert
 * @return A JSoup node containing the converted element
 */
public static Node toJsoup(Document document, Element element) {
    if (element.isTextNode()) {
        return new TextNode(element.getText(), document.baseUri());
    }

    org.jsoup.nodes.Element converted = document
            .createElement(element.getTag());
    if (element.hasProperty("innerHTML")) {
        String innerHtml = (String) element.getPropertyRaw("innerHTML");
        converted.html(innerHtml);
    }

    element.getAttributeNames().forEach(attributeName -> {
        String value = element.getAttribute(attributeName);
        boolean isBooleanAttribute = "".equals(value);
        if (isBooleanAttribute) {
            converted.attr(attributeName, true);
        } else {
            converted.attr(attributeName, value);
        }
    });

    element.getChildren().forEach(child -> {
        Node convertedChild = toJsoup(document, child);
        converted.appendChild(convertedChild);
    });

    return converted;
}
 
Example #16
Source File: ParagraphsExplorer.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Records leaf nodes as the traversal reaches them.
 *
 * <p>Only nodes without children are considered. Text nodes whose outer HTML
 * is blank are dropped; every other leaf is merged into the result and
 * remembered in {@code nodes}.
 *
 * @param node  the node being entered
 * @param depth the node's depth in the tree (not used here)
 */
@Override
public void head(Node node, int depth)
{
    if (node.childNodeSize() != 0) {
        return;
    }
    boolean blankText = node instanceof TextNode && StringUtil.isBlank(node.outerHtml());
    if (blankText) {
        return;
    }
    mergeToResult(node);
    nodes.add(node);
}
 
Example #17
Source File: CssSelector.java    From zongtui-webcrawler with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Returns the element's own text: the concatenated text of its direct
 * text-node children, without descending into child elements.
 *
 * @param element the element to read
 * @return the concatenated text of the direct text nodes
 */
protected String getText(Element element) {
    StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}
 
Example #18
Source File: CommonParser.java    From ZfsoftCampusAssit with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the available terms from the element with id {@code xqd} in the given
 * HTML and stores them in {@code setting}.
 *
 * <p>Every direct child carrying a {@code value} attribute contributes that
 * value to {@code setting.ownTerms}. The child that also carries a
 * {@code selected} attribute determines {@code setting.currentTerm}, which is
 * set to that child's {@code value}. If the HTML contains no element with id
 * {@code xqd}, {@code setting} is left untouched.
 *
 * @param rawHtml the page HTML to parse
 * @param setting the settings object receiving the parsed terms
 */
public void parseCollegeTerms(String rawHtml, Setting setting) {
    Element doc = Jsoup.parse(rawHtml).getElementById("xqd");
    if (doc == null) {
        // getElementById returns null when the id is absent; nothing to parse.
        return;
    }
    for (Node yearNode : doc.childNodes()) {
        if (!yearNode.hasAttr("value")) {
            continue;
        }
        String term = yearNode.attr("value");
        setting.ownTerms.add(term);
        if (yearNode.hasAttr("selected")) {
            // The "selected" attribute is only a marker; the term itself is
            // the option's value, not the marker attribute's content.
            setting.currentTerm = term;
        }
    }
}
 
Example #19
Source File: TruncateHtmlFilter.java    From jinjava with Apache License 2.0 5 votes vote down vote up
/**
 * Marks elements whose text is blank by adding the {@code __deleteme} class
 * as the traversal leaves them. Non-element nodes are ignored.
 *
 * @param node  the node being left
 * @param depth the node's depth in the tree (not used here)
 */
@Override
public void tail(Node node, int depth) {
  if (!(node instanceof Element)) {
    return;
  }
  Element element = (Element) node;
  boolean hasNoText = StringUtils.isBlank(element.text());
  if (hasNoText) {
    element.addClass("__deleteme");
  }
}
 
Example #20
Source File: HtmlTreeBuilder.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Inserts a node at the current insertion point of the tree being built.
 *
 * <p>With an empty stack the node goes directly into the document; with
 * foster inserts active it goes to the foster parent; otherwise it is appended
 * to the current element. Form-listed elements are additionally registered
 * with the current form element, if there is one.
 *
 * @param node the node to insert
 */
private void insertNode(Node node) {
    if (stack.size() == 0) {
        // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc
        doc.appendChild(node);
    } else if (isFosterInserts()) {
        insertInFosterParent(node);
    } else {
        currentElement().appendChild(node);
    }

    // connect form controls to their form element
    boolean formListed = node instanceof Element && ((Element) node).tag().isFormListed();
    if (formListed && formElement != null) {
        formElement.addElement((Element) node);
    }
}
 
Example #21
Source File: HTMLJsoupCleanerImpl.java    From Asqatasun with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Removes every comment node from the subtree below the given node.
 *
 * @param node the node whose descendants are stripped of comments
 */
private void removeComments(Node node) {
    // Children are removed while we walk them, so a for-each over the child
    // list would fail with a concurrent modification error. Walk by index
    // instead and only advance when the current child was kept.
    for (int index = 0; index < node.childNodes().size(); ) {
        Node candidate = node.childNode(index);
        if (!candidate.nodeName().equals("#comment")) {
            removeComments(candidate);
            index++;
            continue;
        }
        candidate.remove();
    }
}
 
Example #22
Source File: ParagraphsExplorer.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Starts a new paragraph from the given node, initializes its raw text
 * information and adds it to {@code paragraphs}. The paragraph is added
 * even when its raw text turns out to be empty.
 *
 * @param node the node the new paragraph is created from
 */
private void insertAsNewParagraph(Node node)
{
    Paragraph created = new Paragraph(node);
    created.initRawInfo();
    paragraphs.add(created);
}
 
Example #23
Source File: ParagraphsExplorer.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Appends a text node to the most recently added paragraph.
 *
 * <p>The node's string form is added to the paragraph's raw text after a
 * single space. If the node is inside a link, its text length is added to the
 * paragraph's {@code charsCountInLinks}. Nodes that are not text nodes are
 * ignored.
 *
 * @param node the node to append
 */
private void appendToLastParagraph(Node node)
{
    if (!(node instanceof TextNode)) {
        return;
    }
    Paragraph last = paragraphs.getLast();
    last.setRawText(last.getRawText() + " " + node);
    if (NodeHelper.isLink(node)) {
        last.charsCountInLinks += ((TextNode) node).text().length();
    }
    last.add(node);
}
 
Example #24
Source File: ParagraphsExplorer.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the node most recently added to {@code nodes}.
 *
 * @return the last recorded node, or {@code null} when none has been recorded
 */
private Node getLastAddedNode()
{
    return nodes.isEmpty() ? null : nodes.getLast();
}
 
Example #25
Source File: JusTextBoilerplateRemoval.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a document (sub)tree into a list of blocks (paragraphs) by
 * traversing it with a fresh {@code ParagraphsExplorer}.
 *
 * @param node the root of the tree to traverse
 * @return the paragraphs collected by the explorer
 */
private LinkedList<Paragraph> makeParagraphs(Node node)
{
    ParagraphsExplorer explorer = new ParagraphsExplorer();
    // Walk the tree; the explorer accumulates paragraphs as it goes.
    node.traverse(explorer);
    return explorer.getParagraphs();
}
 
Example #26
Source File: NodeHelper.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Returns true if the node itself or any of its ancestors is a link tag.
 *
 * @param node the node to start from; {@code null} yields {@code false}
 * @return {@code true} when a link tag is found on the path to the root
 */
public static boolean isLink(Node node)
{
    for (Node current = node; current != null; current = current.parent()) {
        if (isLinkTag(current)) {
            return true;
        }
    }
    return false;
}
 
Example #27
Source File: JsoupHelper.java    From WordCount with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Builds a comma-separated path of node names from the root down to the
 * given node.
 *
 * <p>Every name, including the first, is preceded by a comma, so the result
 * has the form {@code ,root,...,node}. A {@code null} node yields the empty
 * string.
 *
 * @param node the node whose path is wanted
 * @return the comma-prefixed path of names
 */
public static String getXpath(Node node) {
    String path = "";
    // Walk upwards, prepending each name so the root ends up first.
    for (Node current = node; current != null; current = current.parent()) {
        path = "," + getNodeName(current) + path;
    }
    return path;
}
 
Example #28
Source File: DocumentToJCasConverter.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Walk the HTML document node by node, creating annotations and text.
 *
 * <p>For each node: its text (if {@code captureText} is set and the text is
 * non-empty) is added to the builder, its children are walked one level
 * deeper, and finally any annotations mapped from the node are added,
 * spanning the builder offsets before and after the node's subtree.
 *
 * @param builder the builder
 * @param root the root; {@code null} is ignored
 * @param depth the depth
 * @param captureText whether node text is added to the builder
 */
private void walk(
    final JCasBuilder builder, final Node root, final int depth, final boolean captureText) {
  if (root == null) {
    return;
  }

  final int startOffset = builder.getCurrentOffset();
  if (captureText) {
    // Generate the text and the annotations
    final String nodeText = mapToText(root);
    if (!Strings.isNullOrEmpty(nodeText)) {
      builder.addText(nodeText);
    }
  }

  // Only elements are mapped to annotations; other node types yield none.
  final List<Annotation> pending =
      root instanceof Element ? mapElementToAnnotations(builder.getJCas(), (Element) root) : null;

  // BUG: With multiple mappers depth here is wrong! It puts all mappers at the same depth...
  // (though in fairness they are all the same begin-end and same element too)

  // Walk the children
  if (root.childNodeSize() > 0) {
    for (final Node child : root.childNodes()) {
      walk(builder, child, depth + 1, captureText);
    }
  }

  // Add annotations to the JCas
  final int endOffset = builder.getCurrentOffset();
  if (pending == null || pending.isEmpty()) {
    return;
  }
  builder.addAnnotations(pending, startOffset, endOffset, depth);
}
 
Example #29
Source File: Evaluator.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Matches elements that have no content: every child node must be a comment,
 * an XML declaration or a document type. An element with no children matches.
 *
 * @param root    the root of the evaluation (not used here)
 * @param element the element to test
 * @return {@code true} if the element has only ignorable children
 */
@Override
public boolean matches(Element root, Element element) {
    List<Node> children = element.childNodes();
    for (Node child : children) {
        boolean ignorable = child instanceof Comment
                || child instanceof XmlDeclaration
                || child instanceof DocumentType;
        if (!ignorable) {
            return false;
        }
    }
    return true;
}
 
Example #30
Source File: HtmlToPlainText.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Emits the plain-text prefix for a node as the traversal enters it.
 *
 * <p>Text nodes contribute their text, {@code li} starts a bulleted line,
 * {@code dt} adds a two-space indent, and paragraph, heading (h1-h5) and
 * table-row tags start a new line.
 *
 * @param node  the node being entered
 * @param depth the node's depth in the tree (not used here)
 */
public void head(Node node, int depth) {
    if (node instanceof TextNode) {
        // TextNodes carry all user-readable text in the DOM.
        append(((TextNode) node).text());
        return;
    }

    String tag = node.nodeName();
    if (tag.equals("li")) {
        append("\n * ");
        return;
    }
    if (tag.equals("dt")) {
        append("  ");
        return;
    }
    if (StringUtil.in(tag, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
        append("\n");
    }
}