Java Code Examples for org.jsoup.nodes.Node

The following examples show how to use org.jsoup.nodes.Node. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source Project: intellij-quarkus   Author: redhat-developer   File: HtmlToPlainText.java    License: Eclipse Public License 2.0 7 votes vote down vote up
@Override
public void head(Node node, int depth) {
    String name = node.nodeName();
    if (node instanceof TextNode) {
        append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.
    } else if (name.equals("ul")) {
        listNesting++;
    } else if (name.equals("li")) {
        append("\n ");
        for (int i = 1; i < listNesting; i++) {
            append("  ");
        }
        if (listNesting == 1) {
            append("* ");
        } else {
            append("- ");
        }
    } else if (name.equals("dt")) {
        append("  ");
    } else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
        append("\n");
    }
}
 
Example #2
Source Project: openemm   Author: agnitas-org   File: ComMailingContentServiceImpl.java    License: GNU Affero General Public License v3.0 6 votes vote down vote up
private void generateTextContent(StringBuilder sb, List<Node> nodes) {
    for (Node node : nodes) {
        if (node instanceof Element) {
            Element element = (Element) node;

            switch (element.nodeName()) {
                case "a":
                    sb.append(getTextLink(element));
                    break;

                case "br":
                    sb.append('\n');
                    break;

                default:
                    generateTextContent(sb, element.childNodes());
                    break;
            }
        } else if (node instanceof TextNode) {
            sb.append(((TextNode) node).getWholeText());
        }
    }
}
 
Example #3
Source Project: lemminx   Author: eclipse   File: HtmlToPlainText.java    License: Eclipse Public License 2.0 6 votes vote down vote up
@Override
public void head(Node node, int depth) {
	String name = node.nodeName();
	if (node instanceof TextNode) {
		append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.
	} else if (name.equals("ul")) {
		listNesting++;
	} else if (name.equals("li")) {
		append("\n ");
		for (int i = 1; i < listNesting; i++) {
			append("  ");
		}
		if (listNesting == 1) {
			append("* ");
		} else {
			append("- ");
		}
	} else if (name.equals("dt")) {
		append("  ");
	} else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
		append("\n");
	}
}
 
Example #4
Source Project: xsoup   Author: code4craft   File: ElementOperator.java    License: MIT License 6 votes vote down vote up
@Override
public String operate(Element element) {
    int index = 0;
    StringBuilder accum = new StringBuilder();
    for (Node node : element.childNodes()) {
        if (node instanceof TextNode) {
            TextNode textNode = (TextNode) node;
            if (group == 0) {
                accum.append(textNode.text());
            } else if (++index == group) {
                return textNode.text();
            }
        }
    }
    return accum.toString();
}
 
Example #5
Source Project: emotional_analysis   Author: 20100507   File: SongTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * 解析出歌手 专辑
 * <p>Title: test4</p>
 * <p>Description: </p>
 * @throws Exception
 */
@Test
public void test4() throws Exception{
	 Response execute = Jsoup.connect("http://music.163.com/song?id=63650")
				.ignoreContentType(true).execute();
	 Document parse = execute.parse();
	 Elements elements = parse.getElementsByClass("s-fc7");
	 Element singerElement = elements.get(1);
	 Node singerChildNode = singerElement.childNode(0);
	 String singer = singerChildNode.toString();
	 //Album
	 Element albumElement = elements.get(2);
	 Node albumChildNode = albumElement.childNode(0);
	 String album = albumChildNode.toString();
	 System.out.println(singer+"--------"+album);
}
 
Example #6
Source Project: emotional_analysis   Author: 20100507   File: SearchUtils.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * 获取歌曲名称
 * <p>Title: getSongNameById</p>
 * <p>Description: </p>
 * @param songId
 * @return
 * @throws Exception
 */
public static String getSongNameById(long songId) throws Exception{
	String songName = null;
	Response execute = Jsoup.connect("http://music.163.com/m/song?id=" + songId)
			.header("User-Agent",
					"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")
			.header("Cache-Control", "no-cache").timeout(2000000000)
			.execute();
	Document parse = execute.parse();
	Elements elementsByClass = parse.getElementsByClass("f-ff2");
	if(elementsByClass.size() > 0){
		Element element = elementsByClass.get(0);
		Node childNode = element.childNode(0);
		songName = childNode.toString();
	}else{
		songName = "ES中歌曲在网易云音乐中找不到";
	}
	return songName;
}
 
Example #7
Source Project: zrlog   Author: 94fzb   File: ParseUtil.java    License: Apache License 2.0 6 votes vote down vote up
public static String autoDigest(String str, int size) {
    StringBuilder sb = new StringBuilder();
    Document document = Jsoup.parseBodyFragment(str);
    List<Node> allTextNode = new ArrayList<>();
    getAllTextNode(document.childNodes(), allTextNode);
    int tLength = 0;
    for (Node node : allTextNode) {
        if (node instanceof TextNode) {
            sb.append(node.parent().outerHtml());
            tLength += ((TextNode) node).text().length();
            if (tLength > size) {
                sb.append(" ...");
                break;
            }
        }
    }
    String digest = sb.toString();
    Elements elements = Jsoup.parse(str).body().select("video");
    if (elements != null && !elements.isEmpty()) {
        digest = elements.get(0).toString() + "<br/>" + digest;
    }
    return digest.trim();
}
 
Example #8
Source Project: jsoup-learning   Author: code4craft   File: NodeTraversor.java    License: MIT License 6 votes vote down vote up
/**
 * Start a depth-first traverse of the root and all of its descendants.
 * @param root the root node point to traverse.
 */
public void traverse(Node root) {
    Node node = root;
    int depth = 0;
    
    while (node != null) {
        visitor.head(node, depth);
        if (node.childNodeSize() > 0) {
            node = node.childNode(0);
            depth++;
        } else {
            while (node.nextSibling() == null && depth > 0) {
                visitor.tail(node, depth);
                node = node.parent();
                depth--;
            }
            visitor.tail(node, depth);
            if (node == root)
                break;
            node = node.nextSibling();
        }
    }
}
 
Example #9
Source Project: scava   Author: crossminer   File: HtmlParser.java    License: Eclipse Public License 2.0 6 votes vote down vote up
private static void readNodes(List<Node> nodeList, List<String> textList)
{
	String tempText;
	for(Node node : nodeList)
	{
		if(node.childNodeSize()>0)
		{
			readNodes(node.childNodes(), textList);
		}
		else
		{
			if(node.nodeName().equals("#text"))
			{
				tempText=((TextNode) node).getWholeText();
				tempText=newline.matcher(tempText).replaceAll("");
				if(!tempText.isEmpty())
					textList.add(tempText);
			}
		}
	}
}
 
Example #10
Source Project: scava   Author: crossminer   File: HtmlParser.java    License: Eclipse Public License 2.0 6 votes vote down vote up
private static void readNodesWithTags(List<Node> nodeList, List<Map.Entry<String,String>> textListMap, String tag)
{
	for(Node node : nodeList)
	{
		if(node.childNodeSize()>0)
		{
			readNodesWithTags(node.childNodes(), textListMap, node.nodeName());
		}
		else
		{
			if(node.nodeName().equals("#text"))
			{
				if(tag.equalsIgnoreCase("body"))
					tag="p";
				textListMap.add(new AbstractMap.SimpleEntry<String,String>(tag, ((TextNode) node).getWholeText() ));
			}
		}
	}
}
 
Example #11
Source Project: Xndroid   Author: XndroidDev   File: OutputFormatter.java    License: GNU General Public License v3.0 6 votes vote down vote up
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }
        if (child instanceof TextNode) {
            TextNode textNode = (TextNode) child;
            String txt = textNode.text();
            accum.append(txt);
        } else if (child instanceof Element) {
            Element element = (Element) child;
            if (accum.length() > 0 && element.isBlock()
                    && !lastCharIsWhitespace(accum))
                accum.append(' ');
            else if (element.tagName().equals("br"))
                accum.append(' ');
            appendTextSkipHidden(element, accum, indent + 1);
        }
    }
}
 
Example #12
Source Project: dkpro-c4corpus   Author: dkpro   File: Paragraph.java    License: Apache License 2.0 6 votes vote down vote up
public void initRawInfo()
{
    StringBuilder sb = new StringBuilder();
    for (Node n : this) {
        //            NodeHelper.cleanEmptyElements(n);
        if (n instanceof TextNode) {
            this.setTagName(getPath(n));
            String nodeRawText = ((TextNode) n).text();
            sb.append(Utils.normalizeBreaks(nodeRawText).trim());

            if (NodeHelper.isLink(n)) {
                charsCountInLinks += nodeRawText.length();
            }
        }
    }

    rawText = sb.toString();
}
 
Example #13
Source Project: zongtui-webcrawler   Author: zongtui   File: ElementOperator.java    License: GNU General Public License v2.0 6 votes vote down vote up
@Override
public String operate(Element element) {
    int index = 0;
    StringBuilder accum = new StringBuilder();
    for (Node node : element.childNodes()) {
        if (node instanceof TextNode) {
            TextNode textNode = (TextNode) node;
            if (group == 0) {
                accum.append(textNode.text());
            } else if (++index == group) {
                return textNode.text();
            }
        }
    }
    return accum.toString();
}
 
Example #14
Source Project: Asqatasun   Author: Asqatasun   File: DeepTextElementBuilder.java    License: GNU Affero General Public License v3.0 6 votes vote down vote up
@Override
public String buildTextFromElement(Element element) {
    StringBuilder elementText = new StringBuilder();
    if (element.hasAttr(ALT_ATTR)) {
        elementText.append(SPACER);
        elementText.append(altAttrTextBuilder.buildTextFromElement(element));
    }
    for (Node child : element.childNodes()) {
        if (child instanceof TextNode && !((TextNode)child).isBlank()) {
           elementText.append(SPACER);
           elementText.append(StringUtils.trim(((TextNode)child).text()));
        } else if (child instanceof Element){
            elementText.append(SPACER);
            elementText.append(buildTextFromElement((Element)child));
        }
    }
    return StringUtils.trim(elementText.toString());
}
 
Example #15
Source Project: flow   Author: vaadin   File: ElementUtil.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Converts the given element and its children to a JSoup node with
 * children.
 *
 * @param document
 *            A JSoup document
 * @param element
 *            The element to convert
 * @return A JSoup node containing the converted element
 */
public static Node toJsoup(Document document, Element element) {
    if (element.isTextNode()) {
        return new TextNode(element.getText(), document.baseUri());
    }

    org.jsoup.nodes.Element target = document
            .createElement(element.getTag());
    if (element.hasProperty("innerHTML")) {
        target.html((String) element.getPropertyRaw("innerHTML"));
    }

    element.getAttributeNames().forEach(name -> {
        String attributeValue = element.getAttribute(name);
        if ("".equals(attributeValue)) {
            target.attr(name, true);
        } else {
            target.attr(name, attributeValue);
        }
    });

    element.getChildren()
            .forEach(child -> target.appendChild(toJsoup(document, child)));

    return target;
}
 
Example #16
Source Project: netcdf-java   Author: Unidata   File: NcepHtmlScraper.java    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
void parseTopDoc() throws IOException {
  String source = "https://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_doc/";
  Document doc = Jsoup.parse(new URL(source), 5 * 1000); // 5 sec timeout
  // System.out.printf("%s%n", doc);

  Elements links = doc.select("a[href]");
  for (Element link : links) {
    // System.out.printf("%s", link);
    Node sib = link.nextSibling();
    String title = null;
    if (sib != null) {
      String sibt = sib.toString();
      title = StringUtil2.remove(sibt, "-").trim();
      // System.out.printf(" == '%s'", title);
    }
    if (link.text().equals("Table 4.2")) {
      // System.out.printf(" == ");
      parseTable42(link.attr("abs:href"), link.text(), title);

    } else {
      if (link.text().startsWith("Table 4")) {
        // System.out.printf(" == ");
        parseCodeTable(link.attr("abs:href"), link.text(), title);
      }
    }
    // System.out.printf("%n");
  }
}
 
Example #17
Source Project: astor   Author: SpoonLabs   File: Evaluator.java    License: GNU General Public License v2.0 5 votes vote down vote up
@Override
public boolean matches(Element root, Element element) {
      	List<Node> family = element.childNodes();
          for (Node n : family) {
              if (!(n instanceof Comment || n instanceof XmlDeclaration || n instanceof DocumentType)) return false;
          }
      	return true;
}
 
Example #18
Source Project: jsoup-learning   Author: code4craft   File: XmlTreeBuilderTest.java    License: MIT License 5 votes vote down vote up
@Test public void xmlFragment() {
    String xml = "<one src='/foo/' />Two<three><four /></three>";
    List<Node> nodes = Parser.parseXmlFragment(xml, "http://example.com/");
    assertEquals(3, nodes.size());

    assertEquals("http://example.com/foo/", nodes.get(0).absUrl("src"));
    assertEquals("one", nodes.get(0).nodeName());
    assertEquals("Two", ((TextNode)nodes.get(1)).text());
}
 
Example #19
Source Project: astor   Author: SpoonLabs   File: XmlTreeBuilderTest.java    License: GNU General Public License v2.0 5 votes vote down vote up
@Test public void xmlFragment() {
    String xml = "<one src='/foo/' />Two<three><four /></three>";
    List<Node> nodes = Parser.parseXmlFragment(xml, "http://example.com/");
    assertEquals(3, nodes.size());

    assertEquals("http://example.com/foo/", nodes.get(0).absUrl("src"));
    assertEquals("one", nodes.get(0).nodeName());
    assertEquals("Two", ((TextNode)nodes.get(1)).text());
}
 
Example #20
Source Project: FairEmail   Author: M66B   File: HtmlHelper.java    License: GNU General Public License v3.0 5 votes vote down vote up
static boolean truncate(Document d, boolean reformat) {
    int max = (reformat ? MAX_FORMAT_TEXT_SIZE : MAX_FULL_TEXT_SIZE);

    int length = 0;
    int images = 0;
    for (Element elm : d.select("*")) {
        if ("img".equals(elm.tagName()))
            images++;

        boolean skip = false;
        for (Node child : elm.childNodes()) {
            if (child instanceof TextNode) {
                TextNode tnode = ((TextNode) child);
                String text = tnode.getWholeText();

                if (length < max) {
                    if (length + text.length() >= max) {
                        text = text.substring(0, max - length) + " ...";
                        tnode.text(text);
                        skip = true;
                    }
                } else {
                    if (skip)
                        tnode.text("");
                }

                length += text.length();
            }
        }

        if (length >= max && !skip)
            elm.remove();
    }

    Log.i("Message size=" + length + " images=" + images);

    return (length >= max);
}
 
Example #21
Source Project: astor   Author: SpoonLabs   File: Parser.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Parse a fragment of HTML into the {@code body} of a Document.
 *
 * @param bodyHtml fragment of HTML
 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
 *
 * @return Document, with empty head, and HTML parsed into body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    Document doc = Document.createShell(baseUri);
    Element body = doc.body();
    List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
    Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified when re-parented
    for (int i = nodes.length - 1; i > 0; i--) {
        nodes[i].remove();
    }
    for (Node node : nodes) {
        body.appendChild(node);
    }
    return doc;
}
 
Example #22
Source Project: onedev   Author: theonedev   File: TextNodeVisitor.java    License: MIT License 5 votes vote down vote up
@Override
public void tail(Node node, int depth) {
	if (node instanceof TextNode) {
		TextNode tn = (TextNode) node;
		if (isApplicable(tn))
			matchedNodes.add(tn);
	}
}
 
Example #23
Source Project: frameworkAggregate   Author: handexing   File: MyJsoup.java    License: Apache License 2.0 5 votes vote down vote up
private static List<FlowerCategory> getCategoryList() {

		List<FlowerCategory> categories = new ArrayList<FlowerCategory>();

		try {
			Document doc = Jsoup.connect("http://www.aihuhua.com/baike/").get();
			Elements catelist = doc.getElementsByClass("catelist");
			Element cates = catelist.first();
			List<Node> childNodes = cates.childNodes();
			for (int i = 0; i < childNodes.size(); i++) {
				Node node = childNodes.get(i);
				List<Node> childs = node.childNodes();
				if (childs != null && childs.size() > 0) {
					FlowerCategory category = new FlowerCategory();
					for (int j = 0; j < childs.size(); j++) {
						Node child = childs.get(j);
						if ("a".equals(child.nodeName())) {
							category.setUrl(child.attr("href"));
							category.setImgPath(child.childNode(1).attr("src"));
						} else if ("h2".equals(child.nodeName())) {
							category.setName(child.attr("title"));
						}
					}
					categories.add(category);
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

		return categories;
	}
 
Example #24
Source Project: emotional_analysis   Author: 20100507   File: IpProxy.java    License: Apache License 2.0 5 votes vote down vote up
public static List<IpEntity> getProxyIp(String url) throws Exception{
	ArrayList<IpEntity> ipList = new ArrayList<>();
	Response execute = Jsoup.connect(url)
			.header("User-Agent",
					"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")
			.header("Cache-Control", "max-age=60").header("Accept", "*/*")
			.header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6").header("Connection", "keep-alive")
			.header("Referer", "http://music.163.com/song?id=186016")
			.header("Origin", "http://music.163.com").header("Host", "music.163.com")
			.header("Content-Type", "application/x-www-form-urlencoded")
			.header("Cookie",
					"UM_distinctid=15e9863cf14335-0a09f939cd2af9-6d1b137c-100200-15e9863cf157f1; vjuids=414b87eb3.15e9863cfc1.0.ec99d6f660d09; _ntes_nnid=4543481cc76ab2fd3110ecaafd5f1288,1505795231854; _ntes_nuid=4543481cc76ab2fd3110ecaafd5f1288; __s_=1; __gads=ID=6cbc4ab41878c6b9:T=1505795247:S=ALNI_MbCe-bAY4kZyMbVKlS4T2BSuY75kw; usertrack=c+xxC1nMphjBCzKpBPJjAg==; NTES_CMT_USER_INFO=100899097%7Cm187****4250%7C%7Cfalse%7CbTE4NzAzNDE0MjUwQDE2My5jb20%3D; [email protected]|1507178162|2|mail163|00&99|CA&1506163335&mail163#hun&430800#10#0#0|187250&1|163|[email protected]; vinfo_n_f_l_n3=8ba0369be425c0d2.1.7.1505795231863.1507950353704.1508150387844; vjlast=1505795232.1508150167.11; Province=0450; City=0454; _ga=GA1.2.1044198758.1506584097; _gid=GA1.2.763458995.1508907342; JSESSIONID-WYYY=Zm%2FnBG6%2B1vb%2BfJp%5CJP8nIyBZQfABmnAiIqMM8fgXABoqI0PdVq%2FpCsSPDROY1APPaZnFgh14pR2pV9E0Vdv2DaO%2BKkifMncYvxRVlOKMEGzq9dTcC%2F0PI07KWacWqGpwO88GviAmX%2BVuDkIVNBEquDrJ4QKhTZ2dzyGD%2Bd2T%2BbiztinJ%3A1508946396692; _iuqxldmzr_=32; playerid=20572717; MUSIC_U=39d0b2b5e15675f10fd5d9c05e8a5d593c61fcb81368d4431bab029c28eff977d4a57de2f409f533b482feaf99a1b61e80836282123441c67df96e4bf32a71bc38be3a5b629323e7bf122d59fa1ed6a2; __remember_me=true; __csrf=2032a8f34f1f92412a49ba3d6f68b2db; __utma=94650624.1044198758.1506584097.1508939111.1508942690.40; __utmb=94650624.20.10.1508942690; __utmc=94650624; __utmz=94650624.1508394258.18.4.utmcsr=xujin.org|utmccn=(referral)|utmcmd=referral|utmcct=/")
			.method(Method.GET).ignoreContentType(true)
			.timeout(2099999999).execute();
	Document pageJson = execute.parse();
	Element body = pageJson.body();
	List<Node> childNodes = body.childNode(11).childNode(3).childNode(5).childNode(1).childNodes();
	//把前10位的代理IP放到List中
	for(int i = 2;i <= 30;i += 2){
		IpEntity ipEntity = new IpEntity();
		Node node = childNodes.get(i);
		List<Node> nodes = node.childNodes();
		String ip = nodes.get(3).childNode(0).toString();
		int port = Integer.parseInt(nodes.get(5).childNode(0).toString());
		ipEntity.setIp(ip);
		ipEntity.setPort(port);
		ipList.add(ipEntity);
	}
	return ipList;
}
 
Example #25
Source Project: emotional_analysis   Author: 20100507   File: GetSongName.java    License: Apache License 2.0 5 votes vote down vote up
@Test
public void test1() throws Exception{
	Response execute = Jsoup.connect("http://music.163.com/m/song?id=" + 91445)
			.header("User-Agent",
					"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")
			.header("Cache-Control", "no-cache").timeout(2000000000)
			.execute();
	Document parse = execute.parse();
	Elements elementsByClass = parse.getElementsByClass("f-ff2");
	Element element = elementsByClass.get(0);
	Node childNode = element.childNode(0);
	// 获取歌曲名称
	String songName = childNode.toString();
	System.out.println(songName);
}
 
Example #26
Source Project: NetDiscovery   Author: fengzhizi715   File: CssSelector.java    License: Apache License 2.0 5 votes vote down vote up
protected String getText(Element element) {
    StringBuilder accum = new StringBuilder();
    for (Node node : element.childNodes()) {
        if (node instanceof TextNode) {
            TextNode textNode = (TextNode) node;
            accum.append(textNode.text());
        }
    }
    return accum.toString();
}
 
Example #27
Source Project: jsoup-learning   Author: code4craft   File: HtmlToPlainText.java    License: MIT License 5 votes vote down vote up
public void tail(Node node, int depth) {
    String name = node.nodeName();
    if (name.equals("br"))
        append("\n");
    else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5"))
        append("\n\n");
    else if (name.equals("a"))
        append(String.format(" <%s>", node.absUrl("href")));
}
 
Example #28
Source Project: Xndroid   Author: XndroidDev   File: OutputFormatter.java    License: GNU General Public License v3.0 5 votes vote down vote up
private boolean unlikely(Node e) {
    if (e.attr("class") != null && e.attr("class").toLowerCase().contains("caption"))
        return true;

    String style = e.attr("style");
    String clazz = e.attr("class");
    return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
}
 
Example #29
Source Project: astor   Author: SpoonLabs   File: HtmlParserTest.java    License: GNU General Public License v2.0 5 votes vote down vote up
@Test public void handleNullContextInParseFragment() {
    String html = "<ol><li>One</li></ol><p>Two</p>";
    List<Node> nodes = Parser.parseFragment(html, null, "http://example.com/");
    assertEquals(1, nodes.size()); // returns <html> node (not document) -- no context means doc gets created
    assertEquals("html", nodes.get(0).nodeName());
    assertEquals("<html> <head></head> <body> <ol> <li>One</li> </ol> <p>Two</p> </body> </html>", StringUtil.normaliseWhitespace(nodes.get(0).outerHtml()));
}
 
Example #30
Source Project: SnowGraph   Author: linzeqipku   File: StackOverflowParser.java    License: Apache License 2.0 5 votes vote down vote up
private static List<CodeInfo> parseHTMLNodeToParagraphs(Node node) {
	List<CodeInfo> paragraphList = new ArrayList<>();
	List<Node> childNodes = node.childNodes();
	for (Node childNode : childNodes) {
		if (childNode.nodeName().equals("p") || childNode.nodeName().equals("li")) continue;
		if (childNode.nodeName().equals("pre"))
			childNode.childNodes().stream()
					.filter(n -> n.nodeName().equals("code"))
					.map(n -> new CodeInfo(StringEscapeUtils.unescapeHtml4(((Element) n).text())))
					.forEach(paragraphList::add);
		else paragraphList.addAll(parseHTMLNodeToParagraphs(childNode));
	}
	return paragraphList;
}