Java Code Examples for com.gargoylesoftware.htmlunit.WebRequest#setAdditionalHeader()

The following examples show how to use com.gargoylesoftware.htmlunit.WebRequest#setAdditionalHeader(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: HtmlEmbed.java    From htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Writes the content referenced by this element's {@code src} attribute to the given file.
 * The content is requested with the enclosing page's charset and with the page URL as referer.
 * @param file the destination file
 * @throws IOException if the content cannot be retrieved or the file cannot be written
 */
public void saveAs(final File file) throws IOException {
    final HtmlPage htmlPage = (HtmlPage) getPage();
    final WebClient client = htmlPage.getWebClient();

    // resolve the (possibly relative) src attribute against the enclosing page
    final URL contentUrl = htmlPage.getFullyQualifiedUrl(getAttributeDirect(SRC_ATTRIBUTE));

    final WebRequest contentRequest = new WebRequest(contentUrl);
    contentRequest.setCharset(htmlPage.getCharset());
    contentRequest.setAdditionalHeader(HttpHeader.REFERER, htmlPage.getUrl().toExternalForm());

    final WebResponse response = client.loadWebResponse(contentRequest);

    // both streams are closed automatically, even when the copy fails
    try (OutputStream out = Files.newOutputStream(file.toPath());
            InputStream in = response.getContentAsStream()) {
        IOUtils.copy(in, out);
    }
}
 
Example 2
Source File: HtmlImageInput.java    From htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * <p>Requests the image referenced by this element's {@code src} attribute.</p>
 * <p><span style="color:red">POTENTIAL PERFORMANCE KILLER - DOWNLOADS THE IMAGE - USE AT YOUR OWN RISK</span></p>
 * <p>Does nothing when {@code downloaded_} is already set. Otherwise the response is stored in
 * {@code imageWebResponse_} and {@code downloaded_} is updated from the outcome.</p>
 *
 * @throws IOException if an error occurs while downloading the image
 */
private void downloadImageIfNeeded() throws IOException {
    if (downloaded_) {
        return;
    }

    final String src = getSrcAttribute();
    // HTMLIMAGE_BLANK_SRC_AS_EMPTY: with this feature a blank src is handled like an empty one
    final boolean srcCountsAsEmpty = "".equals(src)
            || (hasFeature(HTMLIMAGE_BLANK_SRC_AS_EMPTY) && StringUtils.isBlank(src));

    if (!srcCountsAsEmpty) {
        final HtmlPage enclosingPage = (HtmlPage) getPage();
        final WebClient client = enclosingPage.getWebClient();

        final URL imageUrl = enclosingPage.getFullyQualifiedUrl(src);
        final BrowserVersion browserVersion = client.getBrowserVersion();
        final WebRequest imageRequest = new WebRequest(imageUrl,
                browserVersion.getImgAcceptHeader(), browserVersion.getAcceptEncodingHeader());
        imageRequest.setCharset(enclosingPage.getCharset());
        imageRequest.setAdditionalHeader(HttpHeader.REFERER, enclosingPage.getUrl().toExternalForm());
        imageWebResponse_ = client.loadWebResponse(imageRequest);
    }

    // flagged as downloaded either by browser feature or when the response content type mentions "image"
    if (hasFeature(JS_IMAGE_COMPLETE_RETURNS_TRUE_FOR_NO_REQUEST)) {
        downloaded_ = true;
    }
    else {
        downloaded_ = imageWebResponse_ != null && imageWebResponse_.getContentType().contains("image");
    }
}
 
Example 3
Source File: CsrfIT.java    From krazo with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that a POST carrying the CSRF token as a request header (rather than as a
 * form field) is accepted.
 *
 * @throws Exception an error occurs or validation fails.
 */
@Test
public void testFormHeaderOk() throws Exception {
    // Initial GET: must succeed and hand out the CSRF token as a response header
    final HtmlPage csrfPage = webClient.getPage(webUrl + "resources/csrf");
    final WebResponse getResponse = csrfPage.getWebResponse();
    assertEquals(Response.Status.OK.getStatusCode(), getResponse.getStatusCode());
    assertNotNull(getResponse.getResponseHeaderValue(CSRF_HEADER));

    // POST back to the same resource, echoing the token in the request header
    final WebRequest postRequest = new WebRequest(new URL(webUrl + "resources/csrf"));
    postRequest.setHttpMethod(HttpMethod.POST);
    postRequest.setAdditionalHeader(CSRF_HEADER, getResponse.getResponseHeaderValue(CSRF_HEADER));

    final WebResponse postResponse = webClient.loadWebResponse(postRequest);
    assertEquals(Response.Status.OK.getStatusCode(), postResponse.getStatusCode());
}
 
Example 4
Source File: Location.java    From HtmlUnit-Android with Apache License 2.0 6 votes vote down vote up
/**
 * Reloads the current page by re-issuing the request that produced it, after setting
 * its {@code Referer} and {@code Origin} headers from the current page URL.
 * @param force if {@code true}, force reload from server; otherwise, may reload from cache
 *        (this implementation does not read the flag)
 * @throws IOException if there is a problem reloading the page
 * @see <a href="http://msdn.microsoft.com/en-us/library/ms536342.aspx">MSDN Documentation</a>
 */
@JsxFunction
public void reload(final boolean force) throws IOException {
    final HtmlPage currentPage = (HtmlPage) getWindow(getStartingScope()).getWebWindow().getEnclosedPage();
    // the request object of the current response is reused, so the headers below are set on it directly
    final WebRequest reloadRequest = currentPage.getWebResponse().getWebRequest();

    reloadRequest.setAdditionalHeader(HttpHeader.REFERER, currentPage.getUrl().toExternalForm());

    // Origin value: the page URL without its query and without trailing slashes
    final String withoutQuery = UrlUtils.getUrlWithNewQuery(currentPage.getUrl(), null).toExternalForm();
    reloadRequest.setAdditionalHeader(HttpHeader.ORIGIN, StringUtils.stripEnd(withoutQuery, "/"));

    final WebWindow targetWindow = window_.getWebWindow();
    targetWindow.getWebClient().download(targetWindow, "", reloadRequest, true, false, "JS location.reload");
}
 
Example 5
Source File: HtmlArea.java    From HtmlUnit-Android with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>If the (trimmed) {@code href} is non-empty, either executes it as a JavaScript URL or
 * loads the referenced page into the resolved target window with the enclosing page as referer.
 * This implementation always returns {@code false}.</p>
 *
 * @throws IllegalStateException if the {@code href} attribute cannot be resolved to a valid URL
 */
@Override
protected boolean doClickStateUpdate(final boolean shiftKey, final boolean ctrlKey) throws IOException {
    final HtmlPage enclosingPage = (HtmlPage) getPage();
    final WebClient webClient = enclosingPage.getWebClient();

    final String href = getHrefAttribute().trim();
    if (!href.isEmpty()) {
        if (StringUtils.startsWithIgnoreCase(href, JavaScriptURLConnection.JAVASCRIPT_PREFIX)) {
            enclosingPage.executeJavaScript(
                href, "javascript url", getStartLineNumber());
            return false;
        }
        final URL url;
        try {
            url = enclosingPage.getFullyQualifiedUrl(getHrefAttribute());
        }
        catch (final MalformedURLException e) {
            // keep the original exception as cause so the parsing failure is not lost
            throw new IllegalStateException(
                    "Not a valid url: " + getHrefAttribute(), e);
        }
        final WebRequest request = new WebRequest(url);
        request.setAdditionalHeader(HttpHeader.REFERER, enclosingPage.getUrl().toExternalForm());
        final WebWindow webWindow = enclosingPage.getEnclosingWindow();
        webClient.getPage(
                webWindow,
                enclosingPage.getResolvedTarget(getTargetAttribute()),
                request);
    }
    return false;
}
 
Example 6
Source File: ProducesIT.java    From krazo with Apache License 2.0 5 votes vote down vote up
/**
 * Requests {@code resources/locale2} with an {@code Accept-Language} header and expects
 * a 200 XHTML response whose {@code Content-Language} is {@code en-GB}.
 *
 * @throws Exception if the request fails
 */
@Test
public void locale2() throws Exception {
    final URL resourceUrl = new URL(webUrl + "resources/locale2");
    final WebRequest request = new WebRequest(resourceUrl, ACCEPT_HEADER);
    request.setAdditionalHeader("Accept-Language", ACCEPT_LANGUAGE);

    final WebResponse response = webClient.loadWebResponse(request);

    assertEquals(Response.Status.OK.getStatusCode(), response.getStatusCode());
    assertEquals(MediaType.APPLICATION_XHTML_XML, response.getContentType());
    assertEquals("en-GB", response.getResponseHeaderValue("Content-Language"));
}
 
Example 7
Source File: TrimouViewEngineTest.java    From trimou with Apache License 2.0 5 votes vote down vote up
/**
 * Requests {@code resources/simple?user=mike} with {@code Accept-Language: cs} and checks
 * the five colon-separated parts of the response body.
 *
 * @throws FailingHttpStatusCodeException if the server answers with a failing status code
 * @throws IOException if the request fails
 */
@Test
public void testController()
        throws FailingHttpStatusCodeException, IOException {
    final WebClient client = new WebClient();
    final URL simpleUrl = new URL(contextPath, "resources/simple?user=mike");
    final WebRequest simpleRequest = new WebRequest(simpleUrl);
    simpleRequest.setAdditionalHeader("Accept-Language", "cs");

    final Page resultPage = client.getPage(simpleRequest);
    final String body = resultPage.getWebResponse().getContentAsString();
    final String[] segments = body.split(":");

    assertEquals(5, segments.length);
    assertTrue(segments[0].endsWith("/resources/simple"));
    assertEquals("mike", segments[1]);
    assertEquals("cs", segments[2]);
    // the last two parts are expected to carry the same value
    assertEquals(segments[3], segments[4]);
}
 
Example 8
Source File: ProducesIT.java    From krazo with Apache License 2.0 5 votes vote down vote up
/**
 * Requests {@code resources/language1} with an {@code Accept-Language} header and expects
 * a 200 XHTML response whose {@code Content-Language} is {@code es}.
 *
 * @throws Exception if the request fails
 */
@Test
public void language1() throws Exception {
    final URL resourceUrl = new URL(webUrl + "resources/language1");
    final WebRequest request = new WebRequest(resourceUrl, ACCEPT_HEADER);
    request.setAdditionalHeader("Accept-Language", ACCEPT_LANGUAGE);

    final WebResponse response = webClient.loadWebResponse(request);

    assertEquals(Response.Status.OK.getStatusCode(), response.getStatusCode());
    assertEquals(MediaType.APPLICATION_XHTML_XML, response.getContentType());
    assertEquals("es", response.getResponseHeaderValue("Content-Language"));
}
 
Example 9
Source File: HtmlLink.java    From HtmlUnit-Android with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the request used to fetch the resource named by the {@code href} attribute.
 * The request carries the enclosing page's URL as {@code Referer} and the browser's CSS accept header.
 * @return the request which will allow us to retrieve the content referenced by the {@code href} attribute
 * @throws MalformedURLException in case of problem resolving the URL
 */
public WebRequest getWebRequest() throws MalformedURLException {
    final HtmlPage enclosingPage = (HtmlPage) getPage();

    // resolve href relative to the enclosing page
    final WebRequest linkRequest = new WebRequest(enclosingPage.getFullyQualifiedUrl(getHrefAttribute()));

    linkRequest.setAdditionalHeader(HttpHeader.REFERER, enclosingPage.getUrl().toExternalForm());
    linkRequest.setAdditionalHeader(HttpHeader.ACCEPT,
            enclosingPage.getWebClient().getBrowserVersion().getCssAcceptHeader());

    return linkRequest;
}
 
Example 10
Source File: ProducesIT.java    From ozark with Apache License 2.0 5 votes vote down vote up
/**
 * Requests {@code resources/language1} with an {@code Accept-Language} header and expects
 * a 200 XHTML response whose {@code Content-Language} is {@code es}.
 *
 * @throws Exception if the request fails
 */
@Test
public void language1() throws Exception {
    final URL target = new URL(webUrl + "resources/language1");
    final WebRequest languageRequest = new WebRequest(target, ACCEPT_HEADER);
    languageRequest.setAdditionalHeader("Accept-Language", ACCEPT_LANGUAGE);

    final WebResponse languageResponse = webClient.loadWebResponse(languageRequest);

    assertEquals(Response.Status.OK.getStatusCode(), languageResponse.getStatusCode());
    assertEquals(MediaType.APPLICATION_XHTML_XML, languageResponse.getContentType());
    assertEquals("es", languageResponse.getResponseHeaderValue("Content-Language"));
}
 
Example 11
Source File: HtmlEmbed.java    From HtmlUnit-Android with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the content referenced by this element's {@code src} attribute to the given file.
 * The content is requested with the enclosing page's URL as referer.
 * @param file the destination file
 * @throws IOException if the content cannot be retrieved or the file cannot be written
 */
public void saveAs(final File file) throws IOException {
    final HtmlPage htmlPage = (HtmlPage) getPage();
    final WebClient client = htmlPage.getWebClient();

    // resolve the (possibly relative) src attribute against the enclosing page
    final URL contentUrl = htmlPage.getFullyQualifiedUrl(getAttributeDirect("src"));

    final WebRequest contentRequest = new WebRequest(contentUrl);
    contentRequest.setAdditionalHeader(HttpHeader.REFERER, htmlPage.getUrl().toExternalForm());

    final WebResponse response = client.loadWebResponse(contentRequest);

    // both streams are closed automatically, even when the copy fails
    try (FileOutputStream out = new FileOutputStream(file);
            InputStream in = response.getContentAsStream()) {
        IOUtils.copy(in, out);
    }
}
 
Example 12
Source File: Location.java    From htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Reloads the current page by re-issuing the request that produced it. A {@code Referer}
 * header with the current page URL is set when the browser version has the
 * {@code JS_LOCATION_RELOAD_REFERRER} feature.
 * @param force if {@code true}, force reload from server; otherwise, may reload from cache
 *        (this implementation does not read the flag)
 * @throws IOException if there is a problem reloading the page
 * @see <a href="http://msdn.microsoft.com/en-us/library/ms536342.aspx">MSDN Documentation</a>
 */
@JsxFunction
public void reload(final boolean force) throws IOException {
    final HtmlPage currentPage = (HtmlPage) getWindow(getStartingScope()).getWebWindow().getEnclosedPage();
    // the request object of the current response is reused for the reload
    final WebRequest reloadRequest = currentPage.getWebResponse().getWebRequest();

    final boolean sendReferer = getBrowserVersion().hasFeature(JS_LOCATION_RELOAD_REFERRER);
    if (sendReferer) {
        reloadRequest.setAdditionalHeader(HttpHeader.REFERER, currentPage.getUrl().toExternalForm());
    }

    final WebWindow targetWindow = window_.getWebWindow();
    targetWindow.getWebClient().download(targetWindow, "", reloadRequest, true, false, "JS location.reload");
}
 
Example 13
Source File: ProducesIT.java    From ozark with Apache License 2.0 5 votes vote down vote up
/**
 * Requests {@code resources/locale1} with an {@code Accept-Language} header and expects
 * a 200 XHTML response whose {@code Content-Language} is {@code en-GB}.
 *
 * @throws Exception if the request fails
 */
@Test
public void locale1() throws Exception {
    final URL target = new URL(webUrl + "resources/locale1");
    final WebRequest localeRequest = new WebRequest(target, ACCEPT_HEADER);
    localeRequest.setAdditionalHeader("Accept-Language", ACCEPT_LANGUAGE);

    final WebResponse localeResponse = webClient.loadWebResponse(localeRequest);

    assertEquals(Response.Status.OK.getStatusCode(), localeResponse.getStatusCode());
    assertEquals(MediaType.APPLICATION_XHTML_XML, localeResponse.getContentType());
    assertEquals("en-GB", localeResponse.getResponseHeaderValue("Content-Language"));
}
 
Example 14
Source File: Location.java    From HtmlUnit-Android with Apache License 2.0 5 votes vote down vote up
/**
 * Navigates to a new location, or runs the value as script when it starts with the
 * JavaScript URL prefix.
 * @param newLocation the new location URL
 * @throws IOException if loading the specified location fails
 * @see <a href="http://msdn.microsoft.com/en-us/library/ms533867.aspx">MSDN Documentation</a>
 */
@JsxSetter
public void setHref(final String newLocation) throws IOException {
    final HtmlPage currentPage = (HtmlPage) getWindow(getStartingScope()).getWebWindow().getEnclosedPage();
    if (newLocation.startsWith(JavaScriptURLConnection.JAVASCRIPT_PREFIX)) {
        // skips the first 11 characters - presumably the length of the prefix; verify against its definition
        currentPage.executeJavaScript(newLocation.substring(11), "new location value", 1);
        return;
    }
    try {
        URL target = currentPage.getFullyQualifiedUrl(newLocation);
        // fix for empty url
        if (StringUtils.isEmpty(newLocation)) {
            final boolean dropFilename = currentPage.getWebClient().getBrowserVersion().
                    hasFeature(ANCHOR_EMPTY_HREF_NO_FILENAME);
            if (dropFilename) {
                // keep only the directory part of the path (up to and including the last '/')
                final String fullPath = target.getPath();
                final String directory = fullPath.substring(0, fullPath.lastIndexOf('/') + 1);
                target = UrlUtils.getUrlWithNewPath(target, directory);
            }
            // the ref is removed whether or not the filename was dropped
            target = UrlUtils.getUrlWithNewRef(target, null);
        }

        final WebRequest navigation = new WebRequest(target);
        navigation.setAdditionalHeader(HttpHeader.REFERER, currentPage.getUrl().toExternalForm());

        final WebWindow targetWindow = window_.getWebWindow();
        targetWindow.getWebClient().download(targetWindow, "", navigation, true, false, "JS set location");
    }
    catch (final MalformedURLException e) {
        LOG.error("setHref('" + newLocation + "') got MalformedURLException", e);
        throw e;
    }
}
 
Example 15
Source File: Downloader.java    From MMDownloader with Apache License 2.0 5 votes vote down vote up
/**
 * Fetches a page with HtmlUnit and returns its source as XML.
 * The request is a POST carrying the archive password as the {@code pass} parameter.
 *
 * @param eachArchiveAddress address of the archive that holds the actual comic
 * @return the HTML code of the page on success
 * @throws Exception if the URL is invalid or the page cannot be loaded
 */
private String getHtmlPageHtmlUnit(String eachArchiveAddress) throws Exception {
	/* Required! Suppress log output -> HtmlUnit emits far too many verbose logs, so they are all turned off */
	java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
	System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");

	print.info("일반 연결 시도중...\n");

	WebClient webClient = new WebClient();
	// String that will hold the page source, including the HTML code
	final String pageSource;
	try {
		webClient.getOptions().setRedirectEnabled(true);

		WebRequest req = new WebRequest(new URL(eachArchiveAddress));
		req.setHttpMethod(HttpMethod.POST);
		req.setAdditionalHeader("User-Agent", UserAgent.getUserAgent());
		req.setAdditionalHeader("Accept-Encoding", "gzip"); // gzip added 2017-11-26
		req.getRequestParameters().add(new NameValuePair("pass", PASSWORD)); // send the password via POST

		HtmlPage page = webClient.getPage(req);

		pageSource = page.asXml();

		/* TODO: validation for a failed page parse should be added here as well */
	} finally {
		// always release the client, even when the request or the parsing fails
		webClient.close();
	}

	print.info("일반 연결 성공\n");
	return pageSource;
}
 
Example 16
Source File: RequestLocaleSupportTest.java    From trimou with Apache License 2.0 5 votes vote down vote up
/**
 * Requests {@code test} with {@code Accept-Language: cs} and expects the text page
 * content {@code cs:::fr}.
 *
 * @throws FailingHttpStatusCodeException if the server answers with a failing status code
 * @throws MalformedURLException if the test URL cannot be built
 * @throws IOException if the request fails
 */
@Test
public void testRequestLocaleSupport()
        throws FailingHttpStatusCodeException, MalformedURLException,
        IOException {
    final WebClient client = new WebClient();
    final URL testUrl = new URL(contextPath, "test");
    final WebRequest localeRequest = new WebRequest(testUrl);
    localeRequest.setAdditionalHeader("Accept-Language", "cs");

    final TextPage textPage = client.getPage(localeRequest);

    assertEquals("cs:::fr", textPage.getContent());
}
 
Example 17
Source File: Location.java    From htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Navigates to a new location, or runs the value as script when it starts with the
 * JavaScript URL prefix.
 * @param newLocation the new location URL
 * @throws IOException if loading the specified location fails
 * @see <a href="http://msdn.microsoft.com/en-us/library/ms533867.aspx">MSDN Documentation</a>
 */
@JsxSetter
public void setHref(final String newLocation) throws IOException {
    final HtmlPage currentPage = (HtmlPage) getWindow(getStartingScope()).getWebWindow().getEnclosedPage();
    if (newLocation.startsWith(JavaScriptURLConnection.JAVASCRIPT_PREFIX)) {
        // skips the first 11 characters - presumably the length of the prefix; verify against its definition
        currentPage.executeJavaScript(newLocation.substring(11), "new location value", 1);
        return;
    }
    try {
        URL target = currentPage.getFullyQualifiedUrl(newLocation);
        // fix for empty url
        if (StringUtils.isEmpty(newLocation)) {
            final boolean dropFilename = currentPage.getWebClient().getBrowserVersion().
                    hasFeature(ANCHOR_EMPTY_HREF_NO_FILENAME);
            if (dropFilename) {
                // keep only the directory part of the path (up to and including the last '/')
                final String fullPath = target.getPath();
                final String directory = fullPath.substring(0, fullPath.lastIndexOf('/') + 1);
                target = UrlUtils.getUrlWithNewPath(target, directory);
            }
            // the ref is removed whether or not the filename was dropped
            target = UrlUtils.getUrlWithNewRef(target, null);
        }

        final WebRequest navigation = new WebRequest(target);
        navigation.setAdditionalHeader(HttpHeader.REFERER, currentPage.getUrl().toExternalForm());

        final WebWindow targetWindow = window_.getWebWindow();
        targetWindow.getWebClient().download(targetWindow, "", navigation, true, false, "JS set location");
    }
    catch (final MalformedURLException e) {
        // guard kept so the message is only built when error logging is active
        if (LOG.isErrorEnabled()) {
            LOG.error("setHref('" + newLocation + "') got MalformedURLException", e);
        }
        throw e;
    }
}
 
Example 18
Source File: HtmlForm.java    From HtmlUnit-Android with Apache License 2.0 4 votes vote down vote up
/**
 * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
 *
 * Gets the request for a submission of this form with the specified SubmittableElement.
 * For GET submissions the form parameters are encoded into the URL query; otherwise they
 * are set as request parameters.
 * @param submitElement the element that caused the submit to occur
 * @return the request
 * @throws IllegalArgumentException if the action attribute cannot be resolved to a valid URL
 */
public WebRequest getWebRequest(final SubmittableElement submitElement) {
    final HtmlPage htmlPage = (HtmlPage) getPage();
    final List<NameValuePair> parameters = getParameterListForSubmit(submitElement);
    final HttpMethod method;
    final String methodAttribute = getMethodAttribute();
    if ("post".equalsIgnoreCase(methodAttribute)) {
        method = HttpMethod.POST;
    }
    else {
        // anything other than "post" falls back to GET; a non-blank unknown value is reported
        if (!"get".equalsIgnoreCase(methodAttribute) && StringUtils.isNotBlank(methodAttribute)) {
            notifyIncorrectness("Incorrect submit method >" + getMethodAttribute() + "<. Using >GET<.");
        }
        method = HttpMethod.GET;
    }

    final BrowserVersion browser = getPage().getWebClient().getBrowserVersion();
    String actionUrl = getActionAttribute();
    String anchor = null;
    String queryFromFields = "";
    if (HttpMethod.GET == method) {
        // remember the anchor before it is stripped from the action below
        if (actionUrl.contains("#")) {
            anchor = StringUtils.substringAfter(actionUrl, "#");
        }
        final Charset enc = getPage().getCharset();
        queryFromFields =
            URLEncodedUtils.format(Arrays.asList(NameValuePair.toHttpClient(parameters)), enc);

        // action may already contain some query parameters: they have to be removed
        actionUrl = StringUtils.substringBefore(actionUrl, "#");
        actionUrl = StringUtils.substringBefore(actionUrl, "?");
        parameters.clear(); // parameters have been added to query
    }
    URL url;
    try {
        if (actionUrl.isEmpty()) {
            url = WebClient.expandUrl(htmlPage.getUrl(), actionUrl);
        }
        else {
            url = htmlPage.getFullyQualifiedUrl(actionUrl);
        }

        if (!queryFromFields.isEmpty()) {
            url = UrlUtils.getUrlWithNewQuery(url, queryFromFields);
        }

        // decide whether the ref (hash) is removed or replaced by the remembered anchor
        if (HttpMethod.GET == method && browser.hasFeature(FORM_SUBMISSION_URL_WITHOUT_HASH)
                && WebClient.URL_ABOUT_BLANK != url) {
            url = UrlUtils.getUrlWithNewRef(url, null);
        }
        else if (HttpMethod.POST == method
                && browser.hasFeature(FORM_SUBMISSION_URL_WITHOUT_HASH)
                && WebClient.URL_ABOUT_BLANK != url
                && StringUtils.isEmpty(actionUrl)) {
            url = UrlUtils.getUrlWithNewRef(url, null);
        }
        else if (anchor != null
                && WebClient.URL_ABOUT_BLANK != url) {
            url = UrlUtils.getUrlWithNewRef(url, anchor);
        }
    }
    catch (final MalformedURLException e) {
        // keep the original exception as cause so the parsing failure is not lost
        throw new IllegalArgumentException("Not a valid url: " + actionUrl, e);
    }

    final WebRequest request = new WebRequest(url, method);
    request.setAdditionalHeader(HttpHeader.ACCEPT, browser.getHtmlAcceptHeader());
    request.setAdditionalHeader(HttpHeader.ACCEPT_ENCODING, "gzip, deflate");
    request.setRequestParameters(parameters);
    if (HttpMethod.POST == method) {
        request.setEncodingType(FormEncodingType.getInstance(getEnctypeAttribute()));
    }
    request.setCharset(getSubmitCharset());

    String referer = htmlPage.getUrl().toExternalForm();
    request.setAdditionalHeader(HttpHeader.REFERER, referer);

    if (HttpMethod.POST == method
            && browser.hasFeature(FORM_SUBMISSION_HEADER_ORIGIN)) {
        // the Origin value is the referer without trailing slashes
        referer = StringUtils.stripEnd(referer, "/");
        request.setAdditionalHeader(HttpHeader.ORIGIN, referer);
    }
    if (HttpMethod.POST == method
            && browser.hasFeature(FORM_SUBMISSION_HEADER_CACHE_CONTROL_MAX_AGE)) {
        request.setAdditionalHeader(HttpHeader.CACHE_CONTROL, "max-age=0");
    }
    // set last, so "no-cache" replaces "max-age=0" when both features are present
    if (browser.hasFeature(FORM_SUBMISSION_HEADER_CACHE_CONTROL_NO_CACHE)) {
        request.setAdditionalHeader(HttpHeader.CACHE_CONTROL, "no-cache");
    }

    return request;
}
 
Example 19
Source File: HtmlUnitDownloder.java    From gecco-htmlunit with MIT License 4 votes vote down vote up
/**
 * Downloads the page described by {@code request} through the HtmlUnit web client.
 * A 301/302 answer yields the absolute redirect location as content, a 200 answer yields
 * the page XML plus raw stream, content type and charset.
 *
 * @param request the request to execute; an {@code HttpPostRequest} is sent as POST with its fields
 * @param timeout the timeout applied to the web client options
 * @return the populated response
 * @throws DownloadException on any other status code, or wrapping any exception raised while downloading
 */
public HttpResponse download(HttpRequest request, int timeout) throws DownloadException {
	try {
		URL url = new URL(request.getUrl());
		WebRequest webRequest = new WebRequest(url);
		webRequest.setHttpMethod(HttpMethod.GET);
		if(request instanceof HttpPostRequest) {//post
			HttpPostRequest post = (HttpPostRequest)request;
			webRequest.setHttpMethod(HttpMethod.POST);
			List<NameValuePair> requestParameters = new ArrayList<NameValuePair>();
			for(Map.Entry<String, Object> entry : post.getFields().entrySet()) {
				NameValuePair nvp = new NameValuePair(entry.getKey(), entry.getValue().toString());
				requestParameters.add(nvp);
			}
			webRequest.setRequestParameters(requestParameters);	
		}
		//header
		boolean isMobile = SpiderThreadLocal.get().getEngine().isMobile();
		webRequest.setAdditionalHeader("User-Agent", UserAgent.getUserAgent(isMobile));
		webRequest.setAdditionalHeaders(request.getHeaders());
		//proxy
		HttpHost proxy = Proxys.getProxy();
		if(proxy != null) {
			webRequest.setProxyHost(proxy.getHostName());
			webRequest.setProxyPort(proxy.getPort());
		}
		//timeout
		this.webClient.getOptions().setTimeout(timeout);
		//request,response
		// Use the page produced by the configured request. Fetching it a second time by bare URL
		// would issue a duplicate request without the method, parameters, headers and proxy set above.
		HtmlPage page = webClient.getPage(webRequest);
		HttpResponse resp = new HttpResponse();
		WebResponse webResponse = page.getWebResponse();
		int status = webResponse.getStatusCode();
		resp.setStatus(status);
		if(status == 302 || status == 301) {
			String redirectUrl = webResponse.getResponseHeaderValue("Location");
			resp.setContent(UrlUtils.relative2Absolute(request.getUrl(), redirectUrl));
		} else if(status == 200) {
			String content = page.asXml();
			resp.setContent(content);
			resp.setRaw(webResponse.getContentAsStream());
			String contentType = webResponse.getContentType();
			resp.setContentType(contentType);
			String charset = getCharset(request.getCharset(), contentType);
			resp.setCharset(charset);
		} else {
			throw new DownloadException("ERROR : " + status);
		}
		return resp;
	} catch(Exception ex) {
		// every failure, including the status error raised above, is wrapped here
		throw new DownloadException(ex);
	}
}
 
Example 20
Source File: HtmlUnitPageLoader.java    From xxl-crawler with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Loads the page described by {@code pageRequest} with HtmlUnit and parses its XML with Jsoup.
 *
 * @param pageRequest the URL, parameters, cookies, headers, timeout and proxy settings to use
 * @return the parsed document, or {@code null} when the URL is rejected by {@code UrlUtil.isUrl},
 *         the page yields no XML, or an {@link IOException} occurs (which is logged)
 */
@Override
public Document load(PageRequest pageRequest) {
    if (!UrlUtil.isUrl(pageRequest.getUrl())) {
        return null;
    }

    WebClient client = new WebClient();
    try {
        WebRequest request = new WebRequest(new URL(pageRequest.getUrl()));

        // request settings
        client.getOptions().setUseInsecureSSL(true);
        client.getOptions().setJavaScriptEnabled(true);
        client.getOptions().setCssEnabled(false);
        client.getOptions().setThrowExceptionOnScriptError(false);
        client.getOptions().setThrowExceptionOnFailingStatusCode(false);
        client.getOptions().setDoNotTrackEnabled(false);
        // overrides the value set at the start of this group
        client.getOptions().setUseInsecureSSL(!pageRequest.isValidateTLSCertificates());

        if (pageRequest.getParamMap() != null && !pageRequest.getParamMap().isEmpty()) {
            for (Map.Entry<String, String> param : pageRequest.getParamMap().entrySet()) {
                request.getRequestParameters().add(new NameValuePair(param.getKey(), param.getValue()));
            }
        }
        if (pageRequest.getCookieMap() != null && !pageRequest.getCookieMap().isEmpty()) {
            client.getCookieManager().setCookiesEnabled(true);
            for (Map.Entry<String, String> cookie : pageRequest.getCookieMap().entrySet()) {
                client.getCookieManager().addCookie(new Cookie("", cookie.getKey(), cookie.getValue()));
            }
        }
        if (pageRequest.getHeaderMap() != null && !pageRequest.getHeaderMap().isEmpty()) {
            request.setAdditionalHeaders(pageRequest.getHeaderMap());
        }
        // explicit user agent and referrer are applied after the header map
        if (pageRequest.getUserAgent() != null) {
            request.setAdditionalHeader("User-Agent", pageRequest.getUserAgent());
        }
        if (pageRequest.getReferrer() != null) {
            request.setAdditionalHeader("Referer", pageRequest.getReferrer());
        }

        client.getOptions().setTimeout(pageRequest.getTimeoutMillis());
        client.setJavaScriptTimeout(pageRequest.getTimeoutMillis());
        client.waitForBackgroundJavaScript(pageRequest.getTimeoutMillis());

        // proxy
        if (pageRequest.getProxy() != null) {
            InetSocketAddress proxyAddress = (InetSocketAddress) pageRequest.getProxy().address();
            boolean socksProxy = pageRequest.getProxy().type() == Proxy.Type.SOCKS;
            client.getOptions().setProxyConfig(
                    new ProxyConfig(proxyAddress.getHostName(), proxyAddress.getPort(), socksProxy));
        }

        // send the request
        request.setHttpMethod(pageRequest.isIfPost() ? HttpMethod.POST : HttpMethod.GET);
        HtmlPage htmlPage = client.getPage(request);

        String xml = htmlPage.asXml();
        if (xml == null) {
            return null;
        }
        return Jsoup.parse(xml);
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
    } finally {
        // the client was created above and is never reassigned, so it can be closed unconditionally
        client.close();
    }
    return null;
}