Java Code Examples for org.jsoup.nodes.Element#outerHtml()

The following examples show how to use org.jsoup.nodes.Element#outerHtml(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may also check out related API usage in the sidebar.
Example 1
Source File: JsoupTesting.java    From Java-Data-Science-Cookbook with MIT License 6 votes vote down vote up
/**
 * Downloads the page at {@code href} with Jsoup and reads the document title,
 * the body text and, for every anchor that has an {@code href} attribute,
 * its target, text, outer HTML and inner HTML.
 *
 * <p>The extracted values are only held in local variables; nothing is
 * returned or stored. If the download fails with an {@link IOException}
 * the method returns without doing anything.
 *
 * @param href URL of the page to fetch
 */
public void extractDataWithJsoup(String href){
	Document page;
	try {
		page = Jsoup.connect(href)
				.timeout(10*1000)
				.userAgent("Mozilla")
				.ignoreHttpErrors(true)
				.get();
	} catch (IOException e) {
		//Your exception handling here
		return;
	}
	if(page == null){
		return;
	}

	String pageTitle = page.title();
	String bodyText = page.body().text();

	// Only anchors that actually carry an href attribute are selected.
	Elements anchors = page.select("a[href]");
	for (Element anchor : anchors) {
		String target = anchor.attr("href");
		String label = anchor.text();
		String outerMarkup = anchor.outerHtml();
		String innerMarkup = anchor.html();
	}
}
 
Example 2
Source File: CssQueryMethodInterceptor.java    From mica with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Reads a value from {@code element} as described by {@code cssQuery} and,
 * when both the value and the query's regex are non-blank, post-processes
 * it through {@code getRegexValue}.
 *
 * @param element  element to read from; {@code null} yields {@code null}
 * @param cssQuery query describing which attribute to read and the optional regex
 * @return the raw or regex-processed value, or {@code null} when {@code element} is {@code null}
 */
@Nullable
private String getValue(@Nullable Element element, CssQuery cssQuery) {
	if (element == null) {
		return null;
	}
	// The value selected by the attribute name configured on the query
	String rawValue = readRawValue(element, cssQuery.attr());
	// Decide whether regex post-processing is required
	String regex = cssQuery.regex();
	boolean applyRegex = !StringUtil.isBlank(rawValue) && !StringUtil.isBlank(regex);
	if (!applyRegex) {
		return rawValue;
	}
	// Apply the regular expression
	return getRegexValue(regex, cssQuery.regexGroup(), rawValue);
}

/**
 * Selects the raw value for {@code attrName}: a blank name yields the outer HTML,
 * "html" the inner HTML, "text" the result of {@code getText}, "allText" the
 * element's text, and anything else the attribute of that name. The keyword
 * comparisons ignore case.
 */
private String readRawValue(Element element, String attrName) {
	if (StringUtil.isBlank(attrName)) {
		return element.outerHtml();
	}
	if ("html".equalsIgnoreCase(attrName)) {
		return element.html();
	}
	if ("text".equalsIgnoreCase(attrName)) {
		return getText(element);
	}
	if ("allText".equalsIgnoreCase(attrName)) {
		return element.text();
	}
	return element.attr(attrName);
}
 
Example 3
Source File: ScriptFinder.java    From burp-javascript-security-extension with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Parses the HTML held by this object and registers every {@code <script>}
 * element that has a {@code src} attribute. Script elements without a
 * {@code src} attribute are skipped.
 *
 * <p>For each registered script the conditioned source URL is used as the key
 * into {@code htmlScriptData} and is also appended to {@code htmlScripts}.
 */
private void getScriptsFromHtml(){
    Document parsed = Jsoup.parse(html);
    for (Element script : parsed.getElementsByTag("script")){
        if (!script.hasAttr("src")){
            continue;
        }
        String resolvedSrc = conditionReceivedUrl(script.attr("src"), url);
        // The full tag markup is kept alongside the resolved URL.
        htmlScriptData.put(
                resolvedSrc,
                new JavascriptResource(myCallbacks, resolvedSrc, script.outerHtml()));
        htmlScripts.add(resolvedSrc);
    }
}
 
Example 4
Source File: CssSelector.java    From NetDiscovery with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts a string from {@code element} according to the configured {@code attrName}.
 *
 * <ul>
 *   <li>{@code null}: the element's outer HTML</li>
 *   <li>"innerHtml": the element's inner HTML</li>
 *   <li>"text": the result of {@code getText(element)}</li>
 *   <li>"allText": the element's text</li>
 *   <li>anything else: the attribute with that name</li>
 * </ul>
 * Keyword comparisons ignore case.
 *
 * @param element element to read from
 * @return the extracted value
 */
private String getValue(Element element) {
    if (attrName == null) {
        return element.outerHtml();
    }
    if ("innerHtml".equalsIgnoreCase(attrName)) {
        return element.html();
    }
    if ("text".equalsIgnoreCase(attrName)) {
        return getText(element);
    }
    if ("allText".equalsIgnoreCase(attrName)) {
        return element.text();
    }
    return element.attr(attrName);
}
 
Example 5
Source File: HtmlField.java    From jspoon with MIT License 5 votes vote down vote up
/**
 * Reads the value described by {@code spec} from {@code node}.
 *
 * <p>The attribute name in {@code spec} selects the source: an empty name or
 * "text" reads the node's text, "html" or "innerHtml" its inner HTML,
 * "outerHtml" its outer HTML, and any other name the attribute of that name.
 * If {@code spec} has a regex and it is found in the value, the first capture
 * group replaces the value; the spec's default value is used instead when the
 * regex has no groups or the captured text is null or empty. When the regex
 * is not found the value is returned unchanged.
 *
 * @param node      element to read from; {@code null} yields the spec's default value
 * @param fieldType target field type (not used by this method)
 * @return the extracted value
 */
private <U> String getValue(Element node, Class<U> fieldType) {
    if (node == null) {
        return spec.getDefaultValue();
    }

    String attribute = spec.getAttribute();
    String extracted;
    // Methods are invoked on `attribute` itself, so a null attribute raises
    // NullPointerException right here.
    if (attribute.isEmpty() || attribute.equals("text")) {
        extracted = node.text();
    } else if (attribute.equals("html") || attribute.equals("innerHtml")) {
        extracted = node.html();
    } else if (attribute.equals("outerHtml")) {
        extracted = node.outerHtml();
    } else {
        extracted = node.attr(attribute);
    }

    String regex = spec.getRegex();
    if (regex == null) {
        return extracted;
    }
    Matcher matcher = Pattern.compile(regex).matcher(extracted);
    if (!matcher.find()) {
        return extracted;
    }
    // Only the first capture group is considered; a pattern without groups
    // falls through to the default value.
    String captured = (matcher.groupCount() > 0) ? matcher.group(1) : null;
    if (captured == null || captured.isEmpty()) {
        return spec.getDefaultValue();
    }
    return captured;
}
 
Example 6
Source File: ElementOperator.java    From zongtui-webcrawler with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Returns the text this operator works on: the element's outer HTML when no
 * {@code attribute} is configured, otherwise the value of that attribute.
 *
 * @param element element to read from
 * @return outer HTML or the attribute value
 */
protected String getSource(Element element) {
    if (attribute == null) {
        return element.outerHtml();
    }
    String attributeValue = element.attr(attribute);
    // Validated before returning; the message names the attribute and the element.
    Validate.notNull(attributeValue, "Attribute " + attribute + " of " + element + " is not exist!");
    return attributeValue;
}
 
Example 7
Source File: CssSelector.java    From zongtui-webcrawler with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Picks the value to extract from {@code element} based on {@code attrName}:
 * outer HTML when it is {@code null}, inner HTML for "innerHtml", the result
 * of {@code getText(element)} for "text", the element's text for "allText",
 * and otherwise the attribute named {@code attrName}. Keywords are matched
 * case-insensitively.
 *
 * @param element element to read from
 * @return the selected value
 */
private String getValue(Element element) {
    final String result;
    if (attrName == null) {
        result = element.outerHtml();
    } else if ("innerHtml".equalsIgnoreCase(attrName)) {
        result = element.html();
    } else if ("text".equalsIgnoreCase(attrName)) {
        result = getText(element);
    } else if ("allText".equalsIgnoreCase(attrName)) {
        result = element.text();
    } else {
        result = element.attr(attrName);
    }
    return result;
}
 
Example 8
Source File: ZeppelinRDisplay.java    From zeppelin with Apache License 2.0 5 votes vote down vote up
/**
 * Rebuilds the HTML inside {@code body} for display and wraps it in an
 * {@code RDisplay} of type HTML with a success code.
 *
 * <p>For each child of {@code body} the outer HTML is taken, the literal
 * {@code “%html } prefix and closing {@code ”} quote are removed, and, if the
 * child's inner HTML fully matches {@code pattern}, that matched text is
 * removed as well. The pieces are concatenated, protocol-relative
 * {@code src}/{@code href} URLs are rewritten to {@code http://}, the result
 * replaces the body's content, and every {@code img} gets the given width.
 *
 * @param body       element whose children are rewritten in place
 * @param imageWidth value written to the {@code width} attribute of each image
 * @return the display object carrying the rewritten body HTML
 */
private static RDisplay htmlDisplay(Element body, String imageWidth) {
  StringBuilder merged = new StringBuilder();
  for (Element child : body.children()) {
    String inner = child.html();
    String outer = child.outerHtml()
      .replace("“%html " , "")
      .replace("”", "");

    Matcher innerMatch = pattern.matcher(inner);
    if (innerMatch.matches()) {
      outer = outer.replace(innerMatch.group(), "");
    }

    merged.append(outer);
  }

  // Both search strings are plain literals, so literal replacement is sufficient.
  String rewritten = merged.toString()
    .replace("src=\"//", "src=\"http://")
    .replace("href=\"//", "href=\"http://");

  body.html(rewritten);

  for (Element img : body.getElementsByTag("img")) {
    img.attr("width", imageWidth);
  }

  return new RDisplay(body.html(), Type.HTML, Code.SUCCESS);
}
 
Example 9
Source File: ContentExtractor.java    From WebCollector with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Searches the HTML around {@code contentElement} for a timestamp made of
 * year, month, day, hour, minute and second, each separated by one to five
 * non-digit characters.
 *
 * <p>The search starts up to two ancestors above {@code contentElement}
 * (never above the document body) and then walks further up, one parent at a
 * time and at most six elements in total, stopping at the body. If no
 * timestamp is found, {@code getDate(contentElement)} is used as a fallback.
 *
 * @param contentElement element near which the timestamp is expected
 * @return the captured groups joined as {@code year-month-day hour:minute:second}
 *         (digits are returned as captured, without padding), or the result
 *         of {@code getDate} when only a date is present
 * @throws Exception with message "time not found" when neither a timestamp
 *         nor a date can be located; the underlying failure is attached as
 *         the cause
 */
protected String getTime(Element contentElement) throws Exception {
    // The hour group is ([0-2]?[0-9]) so that hours ending in zero (00, 10, 20)
    // are matched as well.
    String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})";
    Pattern pattern = Pattern.compile(regex);
    Element current = contentElement;
    // Widen the search scope by up to two levels, without leaving <body>.
    for (int i = 0; i < 2; i++) {
        if (current != null && current != doc.body()) {
            Element parent = current.parent();
            if (parent != null) {
                current = parent;
            }
        }
    }
    for (int i = 0; i < 6 && current != null; i++) {
        Matcher matcher = pattern.matcher(current.outerHtml());
        if (matcher.find()) {
            return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6);
        }
        if (current == doc.body()) {
            // The walk never goes above <body>; scanning it again cannot
            // produce a different result, so stop here.
            break;
        }
        current = current.parent();
    }

    try {
        return getDate(contentElement);
    } catch (Exception ex) {
        // Keep the original failure attached for diagnosis.
        throw new Exception("time not found", ex);
    }

}
 
Example 10
Source File: ContentExtractor.java    From WebCollector with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Searches the HTML around {@code contentElement} for a date made of year,
 * month and day, each separated by one to five non-digit characters.
 *
 * <p>The search starts up to two ancestors above {@code contentElement}
 * (never above the document body) and then inspects at most six elements,
 * moving to the parent after each miss unless the body has been reached.
 *
 * @param contentElement element near which the date is expected
 * @return the captured groups joined as {@code year-month-day}
 * @throws Exception with message "date not found" when no date is located
 */
protected String getDate(Element contentElement) throws Exception {
    final Pattern datePattern =
            Pattern.compile("([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})");

    Element node = contentElement;
    // Widen the search scope by up to two levels, without leaving <body>.
    for (int step = 0; step < 2; step++) {
        boolean canClimb = node != null && node != doc.body();
        Element parent = canClimb ? node.parent() : null;
        if (parent != null) {
            node = parent;
        }
    }

    int attempts = 0;
    while (attempts < 6 && node != null) {
        Matcher found = datePattern.matcher(node.outerHtml());
        if (found.find()) {
            return found.group(1) + "-" + found.group(2) + "-" + found.group(3);
        }
        if (node != doc.body()) {
            node = node.parent();
        }
        attempts++;
    }
    throw new Exception("date not found");
}
 
Example 11
Source File: JsoupParserIntegrationTest.java    From tutorials with MIT License 5 votes vote down vote up
/**
 * Demonstrates reading data from the first {@code article} of {@code doc}:
 * the {@code datetime} attribute of its first {@code time} element, the text
 * of its first {@code section div}, and the article's inner and outer HTML.
 * The values are only assigned to locals; nothing is asserted.
 */
@Test
public void examplesExtracting() {
    Element article = doc.select("article").first();

    // Attribute access
    Element time = article.select("time").first();
    String articleDateTime = time.attr("datetime");

    // Text access
    Element firstSectionDiv = article.select("section div").first();
    String firstSectionDivText = firstSectionDiv.text();

    // Markup access: inner HTML, then outer HTML
    String innerMarkup = article.html();
    String outerMarkup = article.outerHtml();
}
 
Example 12
Source File: ElementOperator.java    From xsoup with MIT License 5 votes vote down vote up
/**
 * Supplies the source string for this operator. Without a configured
 * {@code attribute} this is the element's outer HTML; with one, it is the
 * value of that attribute on the element.
 *
 * @param element element to read from
 * @return outer HTML or the attribute value
 */
protected String getSource(Element element) {
    final boolean wholeElement = (attribute == null);
    if (wholeElement) {
        return element.outerHtml();
    }
    final String value = element.attr(attribute);
    // Validated before returning; the message names the attribute and the element.
    Validate.notNull(value, "Attribute " + attribute + " of " + element + " is not exist!");
    return value;
}
 
Example 13
Source File: CssSelector.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Resolves the value to extract from {@code element} using {@code attrName}.
 * A {@code null} name selects the outer HTML, "innerHtml" the inner HTML,
 * "text" the result of {@code getText(element)}, "allText" the element's
 * text, and any other name the attribute of that name. Keywords are compared
 * ignoring case.
 *
 * @param element element to read from
 * @return the resolved value
 */
private String getValue(Element element) {
    // Evaluated top to bottom; only the first matching branch is executed.
    return attrName == null ? element.outerHtml()
        : "innerHtml".equalsIgnoreCase(attrName) ? element.html()
        : "text".equalsIgnoreCase(attrName) ? getText(element)
        : "allText".equalsIgnoreCase(attrName) ? element.text()
        : element.attr(attrName);
}
 
Example 14
Source File: ElementOperator.java    From zongtui-webcrawler with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Converts {@code element} to a string by taking its outer HTML.
 *
 * @param element element to serialise
 * @return the value of {@code element.outerHtml()}
 */
@Override
public String operate(Element element) {
    final String serialised = element.outerHtml();
    return serialised;
}
 
Example 15
Source File: ElementOperator.java    From xsoup with MIT License 4 votes vote down vote up
/**
 * Produces the string form of {@code element}, which is its outer HTML.
 *
 * @param element element to serialise
 * @return the value of {@code element.outerHtml()}
 */
@Override
public String operate(Element element) {
    final String markup = element.outerHtml();
    return markup;
}
 
Example 16
Source File: MlMessageParser.java    From symphony-java-client with Apache License 2.0 3 votes vote down vote up
/**
 * Parses {@code message} as HTML and populates this parser's state.
 *
 * <p>A clone of the parsed document is kept in {@code originalDoc}. Any
 * {@code errors} elements are logged at debug level (first one only) and
 * removed. The first {@code messageML} element, or failing that the first
 * {@code div}, becomes {@code elementMessageML}. Its child nodes are passed
 * to {@code stripTags} together with a fresh {@code textDoc} buffer, and the
 * buffer's content split on whitespace becomes {@code textChunks}.
 *
 * @param message message markup to parse
 * @throws SymException with message "Malformed message" when neither a
 *         {@code messageML} nor a {@code div} element is present
 */
public void parseMessage(String message) throws SymException {
    Document parsed = Jsoup.parse(message);
    originalDoc = parsed.clone();

    Element errors = parsed.body().getElementsByTag("errors").first();
    if (errors != null) {
        String errorsHtml = errors.outerHtml();
        if (errorsHtml != null) {
            logger.debug("Errors found in message: {}", errorsHtml);
        }
    }
    //Lets remove the errors elements
    parsed.select("errors").remove();

    elementMessageML = parsed.select("messageML").first();
    if (elementMessageML == null) {
        // Fall back to the first div when there is no messageML element.
        elementMessageML = parsed.select("div").first();
    }

    if (elementMessageML == null) {
        logger.error("Could not parse document for message {}", message);
        throw new SymException("Malformed message");
    }
    String rootHtml = elementMessageML.outerHtml();
    if (rootHtml != null) {
        logger.debug("Doc parsed: {}", rootHtml);
    }

    textDoc = new StringBuilder();
    stripTags(textDoc, elementMessageML.childNodes());

    textChunks = textDoc.toString().split("\\s+");
}