/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.impl;

import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

import java.util.HashMap;
import java.util.Map;

/**
 * Helper methods for JSoup node handling
 * <br>
 * Based on https://github.com/duongphuhiep/justext/ by Duong Phu-Hiep
 *
 * @author Duong Phu-Hiep
 * @author Omnia Zayed
 * @author Ivan Habernal
 */
public class NodeHelper
{
    /**
     * Returns the nearest common ancestor of node1 and node2. The search walks
     * upwards from node1 (starting with node1 itself) and returns the first node
     * that is node2 or an ancestor of node2.
     *
     * @param node1 node 1
     * @param node2 node 2
     * @return nearest common ancestor node (may be node1 or node2 themselves)
     * @throws IllegalStateException if node1 and node2 have no common ancestor,
     *                               e.g. they are not inside the same document
     *                               or one of them is {@code null}
     */
    public static Node nearestCommonAncestor(Node node1, Node node2)
    {
        Node ancestor = node1;
        while (ancestor != null) {
            if (isAncestor(ancestor, node2)) {
                return ancestor;
            }
            ancestor = ancestor.parent();
        }
        throw new IllegalStateException("node1 and node2 do not have common ancestor");
    }

    /**
     * Returns true if node1 is an ancestor of node2 or node1 == node2
     * (this includes the case where both references are {@code null}).
     *
     * @param node1 candidate ancestor
     * @param node2 node whose parent chain is inspected
     * @return boolean value
     */
    public static boolean isAncestor(Node node1, Node node2)
    {
        // identity check first; it also covers node1 == node2 == null
        if (node1 == node2) {
            return true;
        }

        Node ancestor = node2;
        while (ancestor != null) {
            if (ancestor == node1) {
                return true;
            }
            ancestor = ancestor.parent();
        }

        return false;
    }

    /**
     * Returns true if the node itself or any of its ancestors is a link tag
     * (see {@link #isLinkTag(Node)}).
     *
     * @param node node
     * @return boolean value; {@code false} for a {@code null} node
     */
    public static boolean isLink(Node node)
    {
        Node ancestor = node;
        while (ancestor != null) {
            if (isLinkTag(ancestor)) {
                return true;
            }
            ancestor = ancestor.parent();
        }

        return false;
    }

    /**
     * Categories assigned to tag names in {@link #TAGS_TYPE}.
     */
    public enum TagType
    {
        IGNORABLE,
        INNER_TEXT,
        BLOCK_LEVEL,
        BLOCK_LEVEL_CONTENT,
        BLOCK_LEVEL_TITLE
    }

    /**
     * Mapping from (lower-case) tag name to its {@link TagType}. Tag names that
     * are not present have no assigned category ({@code get} returns {@code null}).
     * <br>
     * NOTE: the map is public and mutable for backward compatibility; callers
     * should treat it as read-only.
     */
    public static final Map<String, TagType> TAGS_TYPE = new HashMap<>();

    static {
        // tags whose content is ignored
        TAGS_TYPE.put("style", TagType.IGNORABLE);
        TAGS_TYPE.put("script", TagType.IGNORABLE);
        TAGS_TYPE.put("option", TagType.IGNORABLE);
        TAGS_TYPE.put("noscript", TagType.IGNORABLE);
        TAGS_TYPE.put("embed", TagType.IGNORABLE);
        TAGS_TYPE.put("applet", TagType.IGNORABLE);
        TAGS_TYPE.put("link", TagType.IGNORABLE);
        TAGS_TYPE.put("button", TagType.IGNORABLE);
        TAGS_TYPE.put("select", TagType.IGNORABLE);
        // was "inTAGS_TYPE.put" - a search-and-replace artifact of "input"
        TAGS_TYPE.put("input", TagType.IGNORABLE);
        TAGS_TYPE.put("textarea", TagType.IGNORABLE);
        TAGS_TYPE.put("keygen", TagType.IGNORABLE);

        // block-level tags
        TAGS_TYPE.put("blockquote", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("caption", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("center", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("col", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("colgroup", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("dd", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("div", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("dl", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("dt", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("fieldset", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("form", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("legend", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("optgroup", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("p", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("pre", TagType.BLOCK_LEVEL_CONTENT);
        TAGS_TYPE.put("table", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("td", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("tfoot", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("th", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("thead", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("tr", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("ul", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("ol", TagType.BLOCK_LEVEL);
        TAGS_TYPE.put("li", TagType.BLOCK_LEVEL);

        // headings
        TAGS_TYPE.put("h1", TagType.BLOCK_LEVEL_TITLE);
        TAGS_TYPE.put("h2", TagType.BLOCK_LEVEL_TITLE);
        TAGS_TYPE.put("h3", TagType.BLOCK_LEVEL_TITLE);
        TAGS_TYPE.put("h4", TagType.BLOCK_LEVEL_TITLE);
        TAGS_TYPE.put("h5", TagType.BLOCK_LEVEL_TITLE);
        TAGS_TYPE.put("h6", TagType.BLOCK_LEVEL_TITLE);

        TAGS_TYPE.put("code", TagType.BLOCK_LEVEL_CONTENT); // main content for sure

        TAGS_TYPE.put("b", TagType.INNER_TEXT); // count as text inside block
        TAGS_TYPE.put("u", TagType.INNER_TEXT); // count as text inside block
        TAGS_TYPE.put("i", TagType.INNER_TEXT); // count as text inside block
        // a <br><br> sequence is a paragraph separator (the original note was
        // truncated here); the tag itself is registered as text inside a block
        TAGS_TYPE.put("br", TagType.INNER_TEXT); // count as text inside block
    }

    /**
     * Returns true if the node is an {@link Element} whose node name is
     * registered as {@link TagType#INNER_TEXT} in {@link #TAGS_TYPE}.
     *
     * @param tag node (may be {@code null})
     * @return boolean value
     */
    public static boolean isInnerText(Node tag)
    {
        // instanceof is false for null, so no separate null check is needed
        return tag instanceof Element
                && TAGS_TYPE.get(tag.nodeName()) == TagType.INNER_TEXT;
    }

    /**
     * Returns true if the node is an {@link Element} and
     * {@link Element#isBlock()} reports it as a block.
     *
     * @param tag node (may be {@code null})
     * @return boolean value
     */
    public static boolean isBlockTag(Node tag)
    {
        return tag instanceof Element && ((Element) tag).isBlock();
    }

    /**
     * Returns true if the node is an {@link Element} whose tag is reported
     * as inline by JSoup.
     *
     * @param tag node (may be {@code null})
     * @return boolean value
     */
    public static boolean isInlineTag(Node tag)
    {
        return tag instanceof Element && ((Element) tag).tag().isInline();
    }

    /**
     * Returns true if the node is an {@link Element} named {@code a} or
     * {@code link} (compared case-insensitively).
     *
     * @param elem node (may be {@code null})
     * @return boolean value
     */
    public static boolean isLinkTag(Node elem)
    {
        if (!(elem instanceof Element)) {
            return false;
        }
        String name = elem.nodeName();
        return "a".equalsIgnoreCase(name) || "link".equalsIgnoreCase(name);
    }
}