us.codecraft.webmagic.proxy.Proxy Java Examples

The following examples show how to use us.codecraft.webmagic.proxy.Proxy. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ContentLengthLimitHttpClientDownloader.java    From Gather-Platform with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Returns the HTTP client to use for the given site.
 *
 * <p>With a {@code null} site a client is requested from the generator on
 * every call and is not stored. Otherwise clients are kept in
 * {@code httpClients} keyed by the site's domain; a missing entry is created
 * while holding this instance's monitor, after looking the key up a second
 * time inside the lock.
 *
 * <p>Note that {@code proxy} is only passed on when a client is created; a
 * client already stored for the domain is returned as is.
 *
 * @param site  the site being crawled, may be {@code null}
 * @param proxy the proxy handed to the generator when a client is built
 * @return the stored or newly generated client
 */
private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
    if (site == null) {
        return httpClientGenerator.getClient(null, proxy);
    }

    final String key = site.getDomain();
    CloseableHttpClient client = httpClients.get(key);
    if (client != null) {
        return client;
    }

    synchronized (this) {
        // Look again under the lock: another thread may have stored a client meanwhile.
        client = httpClients.get(key);
        if (client == null) {
            client = httpClientGenerator.getClient(site, proxy);
            httpClients.put(key, client);
        }
        return client;
    }
}
 
Example #2
Source File: ContentLengthLimitHttpClientDownloader.java    From spider with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Looks up, or lazily creates, the HTTP client for a site.
 *
 * <p>A {@code null} site bypasses the {@code httpClients} map entirely and
 * asks the generator for a client directly. For a non-null site the map is
 * consulted by domain; when no client is stored yet, one is generated and
 * stored inside a block synchronized on this instance.
 *
 * @param site  the site being crawled, may be {@code null}
 * @param proxy the proxy forwarded to the generator when a client is built;
 *              not consulted when a stored client is returned
 * @return the client for the site
 */
private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
    if (null == site) {
        return httpClientGenerator.getClient(null, proxy);
    }

    final String domainKey = site.getDomain();
    CloseableHttpClient existing = httpClients.get(domainKey);
    if (existing == null) {
        synchronized (this) {
            existing = httpClients.get(domainKey);
            if (existing != null) {
                // Stored by another thread between the first lookup and taking the lock.
                return existing;
            }
            existing = httpClientGenerator.getClient(site, proxy);
            httpClients.put(domainKey, existing);
        }
    }
    return existing;
}
 
Example #3
Source File: HttpUriRequestConverter.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the {@link HttpClientContext} that accompanies a single request.
 *
 * <p>When the proxy carries a username, a proxy auth state with a
 * {@code BasicScheme} and the proxy's username/password is stored in the
 * context. When the request carries cookies, each one is added to a fresh
 * cookie store with its domain set from the request URL (port removed).
 *
 * @param request the request whose cookies and URL are read
 * @param site    not used by this method
 * @param proxy   the proxy in use, may be {@code null}
 * @return the populated context
 */
private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) {
    HttpClientContext context = new HttpClientContext();

    boolean proxyHasCredentials = proxy != null && proxy.getUsername() != null;
    if (proxyHasCredentials) {
        AuthState proxyAuthState = new AuthState();
        proxyAuthState.update(
                new BasicScheme(ChallengeState.PROXY),
                new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
        context.setAttribute(HttpClientContext.PROXY_AUTH_STATE, proxyAuthState);
    }

    Map<String, String> cookies = request.getCookies();
    if (cookies == null || cookies.isEmpty()) {
        return context;
    }

    CookieStore store = new BasicCookieStore();
    for (Map.Entry<String, String> entry : cookies.entrySet()) {
        BasicClientCookie cookie = new BasicClientCookie(entry.getKey(), entry.getValue());
        cookie.setDomain(UrlUtils.removePort(UrlUtils.getDomain(request.getUrl())));
        store.addCookie(cookie);
    }
    context.setCookieStore(store);
    return context;
}
 
Example #4
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that credentials configured on a {@code Proxy} reach the proxy as a
 * {@code Proxy-Authorization} header.
 *
 * <p>A local stub server on port 13423 plays the proxy and answers "ok" only
 * when the header equals the Basic value for {@code username:password}.
 */
@Test
public void test_download_auth_by_SimpleProxyProvider() throws Exception {
    HttpServer proxyStub = httpServer(13423);
    proxyStub.get(eq(header("Proxy-Authorization"), "Basic dXNlcm5hbWU6cGFzc3dvcmQ=")).response("ok");

    Runner.running(proxyStub, new Runnable() {
        @Override
        public void run() throws Exception {
            Proxy authenticatedProxy = new Proxy("127.0.0.1", 13423, "username", "password");
            HttpClientDownloader downloader = new HttpClientDownloader();
            downloader.setProxyProvider(SimpleProxyProvider.from(authenticatedProxy));

            Request request = new Request();
            request.setUrl("http://www.baidu.com");

            Page downloaded = downloader.download(request, Site.me().toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
 
Example #5
Source File: BlogHunterProcessor.java    From blog-hunter with MIT License 5 votes vote down vote up
/**
 * Runs the crawler and returns the collected articles.
 *
 * <p>The configuration is validated first. When validation reports errors,
 * they are printed through {@code writer} and nothing is crawled.
 *
 * @return the list filled by the pipeline during the crawl, or {@code null}
 *         when the configuration fails validation
 */
@Override
public CopyOnWriteArrayList<VirtualArticle> execute() {
    List<String> errors = this.validateModel(config);
    if (CollectionUtils.isNotEmpty(errors)) {
        writer.print("校验不通过!请依据下方提示,检查输入参数是否正确......");
        for (String error : errors) {
            writer.print(">> " + error);
        }
        return null;
    }

    CopyOnWriteArrayList<VirtualArticle> virtualArticles = new CopyOnWriteArrayList<>();
    Hunter spider = Hunter.create(this, config, uuid);

    // A single downloader is used; the proxy provider is attached to it below
    // instead of building a second downloader and replacing the first one.
    HttpClientDownloader httpClientDownloader = new HttpClientDownloader();

    spider.addUrl(config.getEntryUrls().toArray(new String[0]))
            .setScheduler(new BlockingQueueScheduler(config))
            .addPipeline((resultItems, task) -> this.process(resultItems, virtualArticles, spider))
            .setDownloader(httpClientDownloader)
            .thread(config.getThreadCount());

    // Route downloads through the configured proxies, if any.
    if (CollectionUtils.isNotEmpty(config.getProxyList())) {
        SimpleProxyProvider provider = SimpleProxyProvider.from(config.getProxyList().toArray(new Proxy[0]));
        httpClientDownloader.setProxyProvider(provider);
    }

    // Start the crawl.
    spider.run();
    return virtualArticles;
}
 
Example #6
Source File: HttpClientDownloader.java    From blog-hunter with MIT License 5 votes vote down vote up
/**
 * Downloads the page for a request, optionally through a proxy taken from
 * {@code proxyProvider}.
 *
 * <p>An {@link IOException} during execution or response handling is logged
 * and reported through {@code onError}; the page returned in that case is the
 * one obtained from {@code Page.fail()} unless response handling had already
 * replaced it.
 *
 * @param request the request to execute
 * @param task    the task supplying the site configuration
 * @return the downloaded page, or the failure page on I/O error
 * @throws NullPointerException if {@code task} or its site is {@code null}
 */
@Override
public Page download(Request request, Task task) {
    if (task == null || task.getSite() == null) {
        throw new NullPointerException("task or site can not be null");
    }

    final CloseableHttpClient client = getHttpClient(task.getSite());
    Proxy proxy = null;
    if (proxyProvider != null) {
        proxy = proxyProvider.getProxy(task);
    }
    final HttpClientRequestContext context = httpUriRequestConverter.convert(request, task.getSite(), proxy);

    CloseableHttpResponse response = null;
    Page result = Page.fail();
    try {
        response = client.execute(context.getHttpUriRequest(), context.getHttpClientContext());
        // A charset set on the request takes precedence over the site's charset.
        result = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), response, task);
        onSuccess(request);
        logger.debug("downloading page success {}", request.getUrl());
    } catch (IOException e) {
        logger.warn("download page {} error", request.getUrl(), e);
        onError(request);
    } finally {
        if (response != null) {
            // ensure the connection is released back to pool
            EntityUtils.consumeQuietly(response.getEntity());
        }
        if (proxyProvider != null && proxy != null) {
            // Hand the proxy back together with the outcome of this download.
            proxyProvider.returnProxy(proxy, result, task);
        }
    }
    return result;
}
 
Example #7
Source File: HunterConfig.java    From blog-hunter with MIT License 5 votes vote down vote up
/**
 * Parses a textual proxy list and registers each entry through
 * {@code addProxy}.
 *
 * <p>The text holds one proxy per line, with fields separated by {@code |}:
 * either {@code host|port} or {@code host|port|username|password}. Lines
 * with any other number of fields, and blank lines, are skipped. The call is
 * a no-op unless the proxy type is {@code CUSTOM} and {@code proxyStr} is
 * non-null.
 *
 * @param proxyStr the proxy list text, may be {@code null}
 * @return this configuration, for chaining
 * @throws NumberFormatException if a port field is not a valid integer
 */
public HunterConfig setProxy(String proxyStr) {
    if (this.proxyType != ProxyType.CUSTOM || proxyStr == null) {
        return this;
    }
    // Accept Windows (\r\n), old Mac (\r) and Unix (\n) line endings.
    String[] lines = proxyStr.split("\\r\\n|\\r|\\n");
    for (String line : lines) {
        String entry = line.trim();
        if (entry.isEmpty()) {
            continue;
        }
        // String.split takes a regex and '|' is the alternation operator, so it
        // must be escaped; unescaped it splits between every character.
        String[] fields = entry.split("\\|");
        if (fields.length == 2) {
            this.addProxy(new Proxy(fields[0].trim(), Integer.parseInt(fields[1].trim())));
        } else if (fields.length == 4) {
            this.addProxy(new Proxy(fields[0].trim(), Integer.parseInt(fields[1].trim()), fields[2], fields[3]));
        }
    }
    return this;
}
 
Example #8
Source File: HttpClientDownloader.java    From plumemo with Apache License 2.0 5 votes vote down vote up
/**
 * Executes a request and converts the HTTP response into a {@code Page}.
 *
 * <p>When a proxy provider is set, a proxy is obtained for the task before
 * the request is built and is returned to the provider, together with the
 * resulting page, once the download has finished or failed.
 *
 * @param request the request to execute
 * @param task    the task supplying the site configuration
 * @return the page produced by response handling, or the page from
 *         {@code Page.fail()} when an {@link IOException} occurred first
 * @throws NullPointerException if {@code task} or its site is {@code null}
 */
@Override
public Page download(Request request, Task task) {
    if (task == null || task.getSite() == null) {
        throw new NullPointerException("task or site can not be null");
    }

    final CloseableHttpClient client = getHttpClient(task.getSite());
    Proxy proxy = null;
    if (proxyProvider != null) {
        proxy = proxyProvider.getProxy(task);
    }
    final HttpClientRequestContext context = httpUriRequestConverter.convert(request, task.getSite(), proxy);

    CloseableHttpResponse response = null;
    Page result = Page.fail();
    try {
        response = client.execute(context.getHttpUriRequest(), context.getHttpClientContext());
        // The request's own charset wins; otherwise fall back to the site's.
        result = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), response, task);
        onSuccess(request);
        logger.info("downloading page success {}", request.getUrl());
    } catch (IOException e) {
        logger.warn("download page {} error", request.getUrl(), e);
        onError(request);
    } finally {
        if (response != null) {
            // ensure the connection is released back to pool
            EntityUtils.consumeQuietly(response.getEntity());
        }
        if (proxyProvider != null && proxy != null) {
            proxyProvider.returnProxy(proxy, result, task);
        }
    }
    return result;
}
 
Example #9
Source File: HttpClientDownloader.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Downloads a page with the HTTP client associated with the task's site.
 *
 * <p>I/O failures do not propagate: they are logged at warn level, reported
 * via {@code onError}, and the current page value (initially the one from
 * {@code Page.fail()}) is returned. In every case the response entity is
 * consumed and a proxy obtained from the provider is handed back.
 *
 * @param request the request to execute
 * @param task    the task supplying the site configuration
 * @return the downloaded page, or the failure page on I/O error
 * @throws NullPointerException if {@code task} or its site is {@code null}
 */
@Override
public Page download(Request request, Task task) {
    if (task == null || task.getSite() == null) {
        throw new NullPointerException("task or site can not be null");
    }

    final CloseableHttpClient client = getHttpClient(task.getSite());
    Proxy proxy = null;
    if (proxyProvider != null) {
        proxy = proxyProvider.getProxy(task);
    }
    final HttpClientRequestContext context = httpUriRequestConverter.convert(request, task.getSite(), proxy);

    CloseableHttpResponse response = null;
    Page result = Page.fail();
    try {
        response = client.execute(context.getHttpUriRequest(), context.getHttpClientContext());
        // Prefer the charset set on the request over the site-wide one.
        result = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), response, task);
        onSuccess(request);
        logger.info("downloading page success {}", request.getUrl());
    } catch (IOException e) {
        logger.warn("download page {} error", request.getUrl(), e);
        onError(request);
    } finally {
        if (response != null) {
            // ensure the connection is released back to pool
            EntityUtils.consumeQuietly(response.getEntity());
        }
        if (proxyProvider != null && proxy != null) {
            proxyProvider.returnProxy(proxy, result, task);
        }
    }
    return result;
}
 
Example #10
Source File: HttpUriRequestConverter.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the {@link HttpUriRequest} for a crawl request.
 *
 * <p>Site-level headers, timeouts and cookie spec are applied only when a
 * site is given; the proxy, when present, is set on the request config.
 * Headers carried by the request itself are added last, on the built request.
 *
 * @param request the request supplying the method, URL and its own headers
 * @param site    the site configuration, may be {@code null}
 * @param proxy   the proxy to route through, may be {@code null}
 * @return the request ready for execution
 */
private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
    RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl()));
    RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();

    // All site-derived settings sit behind one null guard; previously the
    // headers were read from site before it was checked for null.
    if (site != null) {
        if (site.getHeaders() != null) {
            for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
                requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
            }
        }
        requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                .setSocketTimeout(site.getTimeOut())
                .setConnectTimeout(site.getTimeOut())
                .setCookieSpec(CookieSpecs.STANDARD);
    }

    if (proxy != null) {
        requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()));
    }
    requestBuilder.setConfig(requestConfigBuilder.build());
    HttpUriRequest httpUriRequest = requestBuilder.build();
    if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
        for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
            httpUriRequest.addHeader(header.getKey(), header.getValue());
        }
    }
    return httpUriRequest;
}
 
Example #11
Source File: HunterConfig.java    From blog-hunter with MIT License 4 votes vote down vote up
/**
 * Appends a proxy to {@code proxyList}.
 *
 * <p>Proxies are accepted only while the proxy type is {@code CUSTOM}, which
 * is the same condition under which {@code setProxy} parses and submits
 * them. The guard previously tested the opposite, so every proxy submitted
 * by {@code setProxy} was discarded.
 * NOTE(review): confirm that no other caller relied on the old, inverted
 * guard.
 *
 * @param proxy the proxy to add; {@code null} is ignored
 */
private void addProxy(Proxy proxy) {
    if (this.proxyType != ProxyType.CUSTOM || null == proxy) {
        return;
    }
    proxyList.add(proxy);
}
 
Example #12
Source File: CrawlerDownloader.java    From tom-crawler with Apache License 2.0 4 votes vote down vote up
/**
 * Downloads the page for a request, optionally through a proxy taken from
 * {@code proxyProvider}.
 *
 * <p>An {@link IOException} is logged with a message chosen by its concrete
 * type and reported through {@code onError}; it is not rethrown. The
 * response entity is always consumed, and a proxy obtained from the provider
 * is always handed back with the resulting page.
 *
 * @param request the request to execute
 * @param task    the task supplying the site configuration
 * @return the downloaded page, or the page from {@code Page.fail()} when an
 *         I/O error occurred before response handling completed
 * @throws NullPointerException if {@code task} or its site is {@code null}
 */
@Override
public Page download(Request request, Task task) {
    if (task == null || task.getSite() == null) {
        throw new NullPointerException("task or site can not be null");
    }

    final CloseableHttpClient client = getHttpClient(task.getSite());
    Proxy proxy = null;
    if (proxyProvider != null) {
        proxy = proxyProvider.getProxy(task);
    }
    final HttpClientRequestContext context = httpUriRequestConverter.convert(request, task.getSite(), proxy);

    CloseableHttpResponse response = null;
    Page result = Page.fail();
    try {
        response = client.execute(context.getHttpUriRequest(), context.getHttpClientContext());
        // A charset set on the request takes precedence over the site's charset.
        result = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), response, task);
        onSuccess(request);
        logger.debug("downloading page success {}", request.getUrl());
    } catch (IOException e) {
        logDownloadFailure(request, e);
        onError(request);
    } finally {
        if (response != null) {
            // ensure the connection is released back to pool
            EntityUtils.consumeQuietly(response.getEntity());
        }
        if (proxyProvider != null && proxy != null) {
            proxyProvider.returnProxy(proxy, result, task);
        }
    }
    return result;
}

/**
 * Logs a download failure with a message specific to the exception type.
 * Checks run in a fixed order, so {@code SSLHandshakeException} is matched
 * before the more general {@code SSLException}.
 */
private void logDownloadFailure(Request request, IOException e) {
    final String url = request.getUrl();
    if (e instanceof ConnectionClosedException) {
        logger.error("Premature end of chunk coded message body: {}", url);
        return;
    }
    if (e instanceof SSLHandshakeException) {
        logger.error("Remote host closed connection during handshake: {}", url);
        return;
    }
    if (e instanceof SSLException) {
        logger.error("SSL peer shut down incorrectly:[HttpClient]  {}", url);
        return;
    }
    if (e instanceof SocketTimeoutException) {
        logger.error("download page time out:{}", url);
        return;
    }
    if (e instanceof NoHttpResponseException) {
        logger.error("failed to respond:{}", url);
        return;
    }
    if (e instanceof HttpHostConnectException) {
        logger.error("Connect to proxy timed out:{}", url);
        return;
    }
    if (e instanceof TruncatedChunkException) {
        logger.error("TruncatedChunkException:{}, msg:{}", url, e.getMessage());
        return;
    }
    // Anything else: include the exception itself so the stack trace is logged.
    logger.error("download page error:{} ", url, e);
}
 
Example #13
Source File: HttpUriRequestConverter.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Converts a crawl request into the pair of objects needed to execute it:
 * the HTTP request and its client context.
 *
 * @param request the request to convert
 * @param site    the site configuration passed to both conversions
 * @param proxy   the proxy passed to both conversions, may be {@code null}
 * @return a context holding the built request and client context
 */
public HttpClientRequestContext convert(Request request, Site site, Proxy proxy) {
    HttpUriRequest uriRequest = convertHttpUriRequest(request, site, proxy);
    HttpClientContext clientContext = convertHttpClientContext(request, site, proxy);

    HttpClientRequestContext converted = new HttpClientRequestContext();
    converted.setHttpUriRequest(uriRequest);
    converted.setHttpClientContext(clientContext);
    return converted;
}