package org.jsoup.integration;

import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;
import org.junit.Test;
import org.junit.Ignore;
import static org.junit.Assert.*;
import org.jsoup.nodes.Document;
import org.jsoup.Jsoup;
import org.jsoup.Connection;

import java.net.MalformedURLException;
import java.net.URL;
import java.io.IOException;
import java.util.Map;

/**
 Tests the URL connection. Not enabled by default, so tests don't require network connection.

 @author Jonathan Hedley
 */
@Ignore // ignored by default so tests don't require network access. comment out to enable.
public class UrlConnectTest {
    /**
     * Endpoint used by the GET/POST/cookie tests. Its response is read through
     * {@link #ihVal(String, Document)}, i.e. as a table of {@code th}/{@code td} pairs;
     * presumably it echoes request details back -- confirm against the server script.
     */
    private static final String ECHO_URL = "http://direct.infohound.net/tools/q.pl";

    /** Fetches a page via {@code Jsoup.parse(URL, int)} and checks its title. */
    @Test
    public void fetchURl() throws IOException {
        String url = "http://www.google.com"; // no trailing / to force redir
        Document doc = Jsoup.parse(new URL(url), 10*1000);
        assertTrue(doc.title().contains("Google"));
    }

    /** Checks charset, cookie and content type reported for a GBK-encoded page. */
    @Test
    public void fetchBaidu() throws IOException {
        Connection.Response res = Jsoup.connect("http://www.baidu.com/").timeout(10*1000).execute();
        Document doc = res.parse();

        assertEquals("GBK", doc.outputSettings().charset().displayName());
        assertEquals("GBK", res.charset());
        // JUnit assertion rather than the Java 'assert' keyword, which is skipped unless the JVM runs with -ea
        assertTrue(res.hasCookie("BAIDUID"));
        assertEquals("text/html;charset=gbk", res.contentType());
    }

    /** Expects an {@link UnsupportedMimeTypeException} carrying the URL and mime type for an image resource. */
    @Test
    public void exceptOnUnknownContentType() {
        String url = "http://jsoup.org/rez/osi_logo.png"; // not text/* but image/png, should throw
        boolean threw = false;
        try {
            Jsoup.parse(new URL(url), 3000);
        } catch (UnsupportedMimeTypeException e) {
            threw = true;
            assertEquals("org.jsoup.UnsupportedMimeTypeException: Unhandled content type. Must be text/*, application/xml, or application/xhtml+xml. Mimetype=image/png, URL=http://jsoup.org/rez/osi_logo.png", e.toString());
            assertEquals(url, e.getUrl());
            assertEquals("image/png", e.getMimeType());
        } catch (IOException e) {
            // surface the real cause instead of silently swallowing it
            fail("Expected UnsupportedMimeTypeException but got: " + e);
        }
        assertTrue("Expected UnsupportedMimeTypeException for " + url, threw);
    }

    /** Expects a {@link MalformedURLException} when connecting to a non-http(s) URL. */
    @Test
    public void exceptOnUnsupportedProtocol() {
        String url = "file://etc/passwd";
        boolean threw = false;
        try {
            Jsoup.connect(url).get();
        } catch (MalformedURLException e) {
            threw = true;
            assertEquals("java.net.MalformedURLException: Only http & https protocols supported", e.toString());
        } catch (IOException e) {
            // surface the real cause instead of silently swallowing it
            fail("Expected MalformedURLException but got: " + e);
        }
        assertTrue("Expected MalformedURLException for " + url, threw);
    }

    /** With {@code ignoreContentType(true)} the image resource is parsed without throwing. */
    @Test
    public void ignoresContentTypeIfSoConfigured() throws IOException {
        Document doc = Jsoup.connect("http://jsoup.org/rez/osi_logo.png").ignoreContentType(true).get();
        assertEquals("", doc.title()); // this will cause an ugly parse tree
    }

    /** Posts repeated and non-ASCII form data plus a cookie, then checks the values read back from the response. */
    @Test
    public void doesPost() throws IOException {
        Document doc = Jsoup.connect(ECHO_URL)
            .data("uname", "Jsoup", "uname", "Jonathan", "百", "度一下")
            .cookie("auth", "token")
            .post();

        assertEquals("POST", ihVal("REQUEST_METHOD", doc));
        //assertEquals("gzip", ihVal("HTTP_ACCEPT_ENCODING", doc)); // current proxy removes gzip on post
        assertEquals("auth=token", ihVal("HTTP_COOKIE", doc));
        assertEquals("度一下", ihVal("百", doc));
        assertEquals("Jsoup, Jonathan", ihVal("uname", doc));
    }

    /** Issues a GET with query data, user agent and referrer, then checks the values read back from the response. */
    @Test
    public void doesGet() throws IOException {
        Connection con = Jsoup.connect(ECHO_URL + "?what=the")
            .userAgent("Mozilla")
            .referrer("http://example.com")
            .data("what", "about & me?");

        Document doc = con.get();
        assertEquals("what=the&what=about+%26+me%3F", ihVal("QUERY_STRING", doc));
        assertEquals("the, about & me?", ihVal("what", doc));
        assertEquals("Mozilla", ihVal("HTTP_USER_AGENT", doc));
        assertEquals("http://example.com", ihVal("HTTP_REFERER", doc));
    }

    /**
     * Returns the text of the {@code td} immediately following the first {@code th}
     * whose text contains {@code key}.
     *
     * @param key text to look for in a {@code th} cell
     * @param doc parsed response document to search
     * @return text of the adjacent {@code td} cell
     */
    private static String ihVal(String key, Document doc) {
        return doc.select("th:contains(" + key + ") + td").first().text();
    }

    /** A 302 response is followed and the target page is parsed. */
    @Test
    public void followsTempRedirect() throws IOException {
        Connection con = Jsoup.connect("http://direct.infohound.net/tools/302.pl"); // http://jsoup.org
        Document doc = con.get();
        assertTrue(doc.title().contains("jsoup"));
    }

    /** After a POST is redirected, the final response reports the target URL and method GET. */
    @Test
    public void postRedirectsFetchWithGet() throws IOException {
        Connection con = Jsoup.connect("http://direct.infohound.net/tools/302.pl")
            .data("Argument", "Riposte")
            .method(Connection.Method.POST);
        Connection.Response res = con.execute();
        assertEquals("http://jsoup.org", res.url().toExternalForm());
        assertEquals(Connection.Method.GET, res.method());
    }

    /** A redirect from http to https is followed. */
    @Test
    public void followsRedirectToHttps() throws IOException {
        Connection con = Jsoup.connect("http://direct.infohound.net/tools/302-secure.pl"); // https://www.google.com
        con.data("id", "5");
        Document doc = con.get();
        assertTrue(doc.title().contains("Google"));
    }

    /** A redirect with a relative Location is followed. */
    @Test
    public void followsRelativeRedirect() throws IOException {
        Connection con = Jsoup.connect("http://direct.infohound.net/tools/302-rel.pl"); // to ./ - /tools/
        Document doc = con.post();
        assertTrue(doc.title().contains("HTML Tidy Online"));
    }

    /** Expects an {@link HttpStatusException} carrying the URL and status code for a 404 response. */
    @Test
    public void throwsExceptionOnError() {
        String url = "http://direct.infohound.net/tools/404";
        Connection con = Jsoup.connect(url);
        boolean threw = false;
        try {
            con.get();
        } catch (HttpStatusException e) {
            threw = true;
            assertEquals("org.jsoup.HttpStatusException: HTTP error fetching URL. Status=404, URL=http://direct.infohound.net/tools/404", e.toString());
            assertEquals(url, e.getUrl());
            assertEquals(404, e.getStatusCode());
        } catch (IOException e) {
            // surface the real cause instead of silently swallowing it
            fail("Expected HttpStatusException but got: " + e);
        }
        assertTrue("Expected HttpStatusException for " + url, threw);
    }

    /** With {@code ignoreHttpErrors(true)} a 404 response is returned and can be parsed. */
    @Test
    public void ignoresExceptionIfSoConfigured() throws IOException {
        Connection con = Jsoup.connect("http://direct.infohound.net/tools/404").ignoreHttpErrors(true);
        Connection.Response res = con.execute();
        Document doc = res.parse();
        assertEquals(404, res.statusCode());
        assertEquals("404 Not Found", doc.select("h1").first().text());
    }

    /** With {@code followRedirects(false)} the 302 response itself is returned. */
    @Test
    public void doesntRedirectIfSoConfigured() throws IOException {
        Connection con = Jsoup.connect("http://direct.infohound.net/tools/302.pl").followRedirects(false);
        Connection.Response res = con.execute();
        assertEquals(302, res.statusCode());
        assertEquals("http://jsoup.org", res.header("Location"));
    }

    /** Cookies set on the redirecting response are sent with the follow-up request and kept in the result. */
    @Test
    public void redirectsResponseCookieToNextResponse() throws IOException {
        Connection con = Jsoup.connect("http://direct.infohound.net/tools/302-cookie.pl");
        Connection.Response res = con.execute();
        assertEquals("asdfg123", res.cookie("token")); // confirms that cookies set on 1st hit are presented in final result
        Document doc = res.parse();
        assertEquals("uid=jhy; token=asdfg123", ihVal("HTTP_COOKIE", doc)); // confirms that redirected hit saw cookie
    }

    /** Expects an {@link IOException} mentioning "Too many redirects" for a redirect loop. */
    @Test
    public void maximumRedirects() {
        boolean threw = false;
        try {
            Jsoup.connect("http://direct.infohound.net/tools/loop.pl").get();
        } catch (IOException e) {
            assertTrue(e.getMessage().contains("Too many redirects"));
            threw = true;
        }
        assertTrue("Expected an IOException for the redirect loop", threw);
    }

    /** Cookies collected from one response can be passed as a map into a new request. */
    @Test
    public void multiCookieSet() throws IOException {
        Connection con = Jsoup.connect("http://direct.infohound.net/tools/302-cookie.pl");
        Connection.Response res = con.execute();

        // test cookies set by redirect:
        Map<String, String> cookies = res.cookies();
        assertEquals("asdfg123", cookies.get("token"));
        assertEquals("jhy", cookies.get("uid"));

        // send those cookies into the echo URL by map:
        Document doc = Jsoup.connect(ECHO_URL).cookies(cookies).get();
        assertEquals("uid=jhy; token=asdfg123", ihVal("HTTP_COOKIE", doc));
    }

    /** An unsupported charset name in the Content-Type header does not break parsing. */
    @Test
    public void handlesDodgyCharset() throws IOException {
        // tests that when we get back "UFT8", that it is recognised as unsupported, and falls back to default instead
        String url = "http://direct.infohound.net/tools/bad-charset.pl";
        Connection.Response res = Jsoup.connect(url).execute();
        assertEquals("text/html; charset=UFT8", res.header("Content-Type")); // from the header
        assertNull(res.charset()); // tried to get from header, not supported, so returns null
        Document doc = res.parse(); // would throw an error if charset unsupported
        assertTrue(doc.text().contains("Hello!"));
        assertEquals("UTF-8", res.charset()); // set from default on parse
    }

    /**
     * Fetches the same large page under several {@code maxBodySize} settings and checks
     * the resulting body and parsed-text lengths. A limit of 0 is expected to return the
     * full body, as is the default setting.
     */
    @Test
    public void maxBodySize() throws IOException {
        String url = "http://direct.infohound.net/tools/large.html"; // 280 K

        Connection.Response defaultRes = Jsoup.connect(url).execute();
        Connection.Response smallRes = Jsoup.connect(url).maxBodySize(50 * 1024).execute(); // crops
        Connection.Response mediumRes = Jsoup.connect(url).maxBodySize(200 * 1024).execute(); // crops
        Connection.Response largeRes = Jsoup.connect(url).maxBodySize(300 * 1024).execute(); // does not crop
        Connection.Response unlimitedRes = Jsoup.connect(url).maxBodySize(0).execute();

        int actualString = 280735;
        assertEquals(actualString, defaultRes.body().length());
        assertEquals(50 * 1024, smallRes.body().length());
        assertEquals(200 * 1024, mediumRes.body().length());
        assertEquals(actualString, largeRes.body().length());
        assertEquals(actualString, unlimitedRes.body().length());

        int actualDocText = 269541;
        assertEquals(actualDocText, defaultRes.parse().text().length());
        assertEquals(49165, smallRes.parse().text().length());
        assertEquals(196577, mediumRes.parse().text().length());
        assertEquals(actualDocText, largeRes.parse().text().length());
        assertEquals(actualDocText, unlimitedRes.parse().text().length());
    }
}