package org.jsoup; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; import org.jsoup.safety.Cleaner; import org.jsoup.safety.Whitelist; import org.jsoup.helper.DataUtil; import org.jsoup.helper.HttpConnection; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URL; /** The core public access point to the jsoup functionality. @author Jonathan Hedley */ public class Jsoup { private Jsoup() {} /** Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML. @param html HTML to parse @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur before the HTML declares a {@code <base href>} tag. @return sane HTML */ public static Document parse(String html, String baseUri) { return Parser.parse(html, baseUri); } /** Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML (non-HTML) parser. @param html HTML to parse @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur before the HTML declares a {@code <base href>} tag. @param parser alternate {@link Parser#xmlParser() parser} to use. @return sane HTML */ public static Document parse(String html, String baseUri, Parser parser) { return parser.parseInput(html, baseUri); } /** Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a {@code <base href>} tag. @param html HTML to parse @return sane HTML @see #parse(String, String) */ public static Document parse(String html) { return Parser.parse(html, ""); } /** * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page. * <p> * Use examples: * <ul> * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li> * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li> * </ul> * @param url URL to connect to. The protocol must be {@code http} or {@code https}. * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute. */ public static Connection connect(String url) { return HttpConnection.connect(url); } /** Parse the contents of a file as HTML. @param in file to load HTML from @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often safe to do). @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. @return sane HTML @throws IOException if the file could not be found, or read, or if the charsetName is invalid. */ public static Document parse(File in, String charsetName, String baseUri) throws IOException { return DataUtil.load(in, charsetName, baseUri); } /** Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. @param in file to load HTML from @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often safe to do). @return sane HTML @throws IOException if the file could not be found, or read, or if the charsetName is invalid. @see #parse(File, String, String) */ public static Document parse(File in, String charsetName) throws IOException { return DataUtil.load(in, charsetName, in.getAbsolutePath()); } /** Read an input stream, and parse it to a Document. @param in input stream to read. Make sure to close it after parsing. @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often safe to do). @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. @return sane HTML @throws IOException if the file could not be found, or read, or if the charsetName is invalid. */ public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException { return DataUtil.load(in, charsetName, baseUri); } /** Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML (non-HTML) parser. @param in input stream to read. Make sure to close it after parsing. @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often safe to do). @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. @param parser alternate {@link Parser#xmlParser() parser} to use. @return sane HTML @throws IOException if the file could not be found, or read, or if the charsetName is invalid. */ public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException { return DataUtil.load(in, charsetName, baseUri, parser); } /** Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. @param bodyHtml body HTML fragment @param baseUri URL to resolve relative URLs against. @return sane HTML document @see Document#body() */ public static Document parseBodyFragment(String bodyHtml, String baseUri) { return Parser.parseBodyFragment(bodyHtml, baseUri); } /** Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. @param bodyHtml body HTML fragment @return sane HTML document @see Document#body() */ public static Document parseBodyFragment(String bodyHtml) { return Parser.parseBodyFragment(bodyHtml, ""); } /** Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead. <p> The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}. @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}. @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown. @return The parsed HTML. @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored @throws java.net.SocketTimeoutException if the connection times out @throws IOException if a connection or read error occurs @see #connect(String) */ public static Document parse(URL url, int timeoutMillis) throws IOException { Connection con = HttpConnection.connect(url); con.timeout(timeoutMillis); return con.get(); } /** Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted tags and attributes. @param bodyHtml input untrusted HTML (body fragment) @param baseUri URL to resolve relative URLs against @param whitelist white-list of permitted HTML elements @return safe HTML (body fragment) @see Cleaner#clean(Document) */ public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) { Document dirty = parseBodyFragment(bodyHtml, baseUri); Cleaner cleaner = new Cleaner(whitelist); Document clean = cleaner.clean(dirty); return clean.body().html(); } /** Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted tags and attributes. @param bodyHtml input untrusted HTML (body fragment) @param whitelist white-list of permitted HTML elements @return safe HTML (body fragment) @see Cleaner#clean(Document) */ public static String clean(String bodyHtml, Whitelist whitelist) { return clean(bodyHtml, "", whitelist); } /** * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of * permitted tags and attributes. * <p>The HTML is treated as a body fragment; it's expected the cleaned HTML will be used within the body of an * existing document. If you want to clean full documents, use {@link Cleaner#clean(Document)} instead, and add * structural tags (<code>html, head, body</code> etc) to the whitelist. * * @param bodyHtml input untrusted HTML (body fragment) * @param baseUri URL to resolve relative URLs against * @param whitelist white-list of permitted HTML elements * @param outputSettings document output settings; use to control pretty-printing and entity escape modes * @return safe HTML (body fragment) * @see Cleaner#clean(Document) */ public static String clean(String bodyHtml, String baseUri, Whitelist whitelist, Document.OutputSettings outputSettings) { Document dirty = parseBodyFragment(bodyHtml, baseUri); Cleaner cleaner = new Cleaner(whitelist); Document clean = cleaner.clean(dirty); clean.outputSettings(outputSettings); return clean.body().html(); } /** Test if the input body HTML has only tags and attributes allowed by the Whitelist. Useful for form validation. <p>The input HTML should still be run through the cleaner to set up enforced attributes, and to tidy the output. <p>Assumes the HTML is a body fragment (i.e. will be used in an existing HTML document body.) @param bodyHtml HTML to test @param whitelist whitelist to test against @return true if no tags or attributes were removed; false otherwise @see #clean(String, org.jsoup.safety.Whitelist) */ public static boolean isValid(String bodyHtml, Whitelist whitelist) { return new Cleaner(whitelist).isValidBodyHtml(bodyHtml); } }