us.codecraft.webmagic.Site Java Examples

The following examples show how to use us.codecraft.webmagic.Site. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a cookie added directly to the {@link Request} reaches the server.
 */
@Test
public void test_set_request_cookie() throws Exception {
    HttpServer mockServer = httpServer(13423);
    // Stub: reply "ok" to GET requests whose cookie "cookie" equals "cookie-webmagic".
    mockServer.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
    Runner.running(mockServer, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();

            Request cookieRequest = new Request();
            cookieRequest.setUrl("http://127.0.0.1:13423");
            cookieRequest.addCookie("cookie","cookie-webmagic");

            Site defaultSite = Site.me();
            Page downloaded = downloader.download(cookieRequest, defaultSite.toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
 
Example #2
Source File: ContentLengthLimitHttpClientDownloader.java    From spider with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Returns the HTTP client for the given site, creating and caching one per
 * site domain on first use.
 *
 * <p>When {@code site} is {@code null} a client is requested from the generator
 * on every call and nothing is cached. Note that the cache is keyed by domain
 * only, so {@code proxy} is used solely when the client for a domain is first created.
 *
 * @param site  site whose domain selects the cached client; may be {@code null}
 * @param proxy proxy handed to the generator when a new client is built
 * @return the cached or newly created client
 */
private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
    if (site == null) {
        return httpClientGenerator.getClient(null, proxy);
    }
    String domain = site.getDomain();
    CloseableHttpClient client = httpClients.get(domain);
    if (client != null) {
        return client;
    }
    synchronized (this) {
        // Look again under the lock: another thread may have created it meanwhile.
        client = httpClients.get(domain);
        if (client == null) {
            client = httpClientGenerator.getClient(site, proxy);
            httpClients.put(domain, client);
        }
        return client;
    }
}
 
Example #3
Source File: RedisSchedulerTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Pushes a request carrying an extra into the Redis scheduler and expects
 * to poll an equal request back for the same task.
 */
@Ignore("environment depended")
@Test
public void test() {
    // Minimal task: fixed UUID "1" and no site.
    Task fixedTask = new Task() {
        @Override
        public Site getSite() {
            return null;
        }

        @Override
        public String getUUID() {
            return "1";
        }
    };
    Request pushed = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/");
    pushed.putExtra("1","2");
    redisScheduler.push(pushed, fixedTask);
    assertThat(redisScheduler.poll(fixedTask)).isEqualTo(pushed);
}
 
Example #4
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that proxy credentials supplied through a {@link SimpleProxyProvider}
 * arrive as a Basic {@code Proxy-Authorization} header.
 */
@Test
public void test_download_auth_by_SimpleProxyProvider() throws Exception {
    HttpServer proxyStub = httpServer(13423);
    // "dXNlcm5hbWU6cGFzc3dvcmQ=" is the Base64 encoding of "username:password".
    proxyStub.get(eq(header("Proxy-Authorization"), "Basic dXNlcm5hbWU6cGFzc3dvcmQ=")).response("ok");
    Runner.running(proxyStub, new Runnable() {
        @Override
        public void run() throws Exception {
            Proxy authenticatedProxy = new Proxy("127.0.0.1", 13423, "username", "password");
            HttpClientDownloader downloader = new HttpClientDownloader();
            downloader.setProxyProvider(SimpleProxyProvider.from(authenticatedProxy));

            Request proxiedRequest = new Request();
            proxiedRequest.setUrl("http://www.baidu.com");

            Page downloaded = downloader.download(proxiedRequest, Site.me().toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
 
Example #5
Source File: ConfigurablePageProcessorTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a {@link ConfigurablePageProcessor} with two XPath rules against the
 * mocked GitHub page and checks the extracted "title" and "star" fields.
 */
@Test
public void test() throws Exception {
    ExtractRule titleRule = new ExtractRule();
    titleRule.setExpressionType(ExpressionType.XPath);
    titleRule.setExpressionValue("//title");
    titleRule.setFieldName("title");

    ExtractRule starRule = new ExtractRule();
    starRule.setExpressionType(ExpressionType.XPath);
    starRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
    starRule.setFieldName("star");

    List<ExtractRule> rules = new ArrayList<ExtractRule>();
    rules.add(titleRule);
    rules.add(starRule);

    ConfigurablePageProcessor processor = new ConfigurablePageProcessor(Site.me(), rules);
    ResultItems extracted = Spider.create(processor)
            .setDownloader(new MockGithubDownloader())
            .get("https://github.com/code4craft/webmagic");

    assertThat(extracted.getAll()).containsEntry("title", "<title>code4craft/webmagic · GitHub</title>");
    assertThat(extracted.getAll()).containsEntry("star", " 86 ");
}
 
Example #6
Source File: CasperjsDownloader.java    From spider with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Downloads a page by asking CasperJS for the HTML of the request URL.
 *
 * <p>On failure, the request is re-queued through {@code addToCycleRetry} when the
 * task's site has cycle retries configured; otherwise the exception is stored in
 * the request extras under {@code "EXCEPTION"}, {@code onError} is called and
 * {@code null} is returned.
 *
 * @param request request whose URL is fetched
 * @param task    task supplying the {@link Site}; may be {@code null}
 * @return the downloaded page, the result of {@code addToCycleRetry}, or
 *         {@code null} when the download failed without retry
 */
@Override
public Page download(Request request, Task task) {
    String html = null;
    Site site = null;
    if (task != null) {
        site = task.getSite();
    }
    try {
        html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
    } catch (Exception e) {
        // site is null when no task (or no site) was supplied; without this guard
        // the catch block itself threw a NullPointerException that hid the real failure.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        request.putExtra("EXCEPTION", e);
        onError(request);
        return null;
    }
    Page page = new Page();
    page.setRawText(html);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    onSuccess(request);
    return page;
}
 
Example #7
Source File: ContentLengthLimitHttpClientDownloader.java    From Gather-Platform with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Looks up, or lazily creates, the HTTP client associated with a site's domain.
 *
 * <p>A {@code null} site bypasses the cache and asks the generator for a client
 * directly. Cached clients are keyed by domain only; {@code proxy} is passed to
 * the generator just when a client is created.
 *
 * @param site  site providing the cache key (its domain); may be {@code null}
 * @param proxy proxy forwarded to the client generator
 * @return the client for this site
 */
private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
    if (site == null) {
        return httpClientGenerator.getClient(null, proxy);
    }
    final String cacheKey = site.getDomain();
    CloseableHttpClient existing = httpClients.get(cacheKey);
    if (existing == null) {
        synchronized (this) {
            // Second lookup inside the lock so only one client is created per key.
            existing = httpClients.get(cacheKey);
            if (existing == null) {
                CloseableHttpClient created = httpClientGenerator.getClient(site, proxy);
                httpClients.put(cacheKey, created);
                existing = created;
            }
        }
    }
    return existing;
}
 
Example #8
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a cookie configured on the {@link Site} (with a matching domain)
 * is sent with the request.
 */
@Test
public void test_set_site_cookie() throws Exception {
    HttpServer mockServer = httpServer(13423);
    // Stub: reply "ok" to GET requests whose cookie "cookie" equals "cookie-webmagic".
    mockServer.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
    Runner.running(mockServer, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();
            Request plainRequest = new Request();
            plainRequest.setUrl("http://127.0.0.1:13423");
            Page downloaded = downloader.download(
                    plainRequest,
                    Site.me().addCookie("cookie", "cookie-webmagic").setDomain("127.0.0.1").toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
 
Example #9
Source File: CommonSpider.java    From Gather-Platform with GNU General Public License v3.0 6 votes vote down vote up
public MyPageProcessor(SpiderInfo info, Task task) {
    this.site = Site.me().setDomain(info.getDomain()).setTimeOut(info.getTimeout())
            .setRetryTimes(info.getRetry()).setSleepTime(info.getSleep())
            .setCharset(StringUtils.isBlank(info.getCharset()) ? null : info.getCharset())
            .setUserAgent(info.getUserAgent());
    //设置抓取代理IP与接口
    if (StringUtils.isNotBlank(info.getProxyHost()) && info.getProxyPort() > 0) {
        this.site.setHttpProxy(new HttpHost(info.getProxyHost(), info.getProxyPort()));
        //设置代理的认证
        if (StringUtils.isNotBlank(info.getProxyUsername()) && StringUtils.isNotBlank(info.getProxyPassword())) {
            this.site.setUsernamePasswordCredentials(new UsernamePasswordCredentials(info.getProxyUsername(), info.getProxyPassword()));
        }
    }
    this.info = info;
    this.task = task;
}
 
Example #10
Source File: CommonSpider.java    From spider with GNU General Public License v3.0 6 votes vote down vote up
public MyPageProcessor(SpiderInfo info, Task task) {
    this.site = Site.me().setDomain(info.getDomain()).setTimeOut(info.getTimeout())
            .setRetryTimes(info.getRetry()).setSleepTime(info.getSleep())
            .setCharset(StringUtils.isBlank(info.getCharset()) ? null : info.getCharset())
            .setUserAgent(info.getUserAgent());
    //设置抓取代理IP与接口
    if (StringUtils.isNotBlank(info.getProxyHost()) && info.getProxyPort() > 0) {
        this.site.setHttpProxy(new HttpHost(info.getProxyHost(), info.getProxyPort()));
        //设置代理的认证
        if (StringUtils.isNotBlank(info.getProxyUsername()) && StringUtils.isNotBlank(info.getProxyPassword())) {
            this.site.setUsernamePasswordCredentials(new UsernamePasswordCredentials(info.getProxyUsername(), info.getProxyPassword()));
        }
    }
    this.info = info;
    this.task = task;
}
 
Example #11
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a request cookie is not sent when the site disables cookie management.
 */
@Test
public void test_disableCookieManagement() throws Exception {
    HttpServer mockServer = httpServer(13423);
    // Stub: reply "ok" only when the cookie "cookie" does NOT equal "cookie-webmagic".
    mockServer.get(not(eq(cookie("cookie"), "cookie-webmagic"))).response("ok");
    Runner.running(mockServer, new Runnable() {
        @Override
        public void run() throws Exception {
            Request cookieRequest = new Request();
            cookieRequest.setUrl("http://127.0.0.1:13423");
            cookieRequest.addCookie("cookie","cookie-webmagic");

            Site cookielessSite = Site.me().setDisableCookieManagement(true);
            HttpClientDownloader downloader = new HttpClientDownloader();
            Page downloaded = downloader.download(cookieRequest, cookielessSite.toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
 
Example #12
Source File: QuickStarter.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the demo selected by the user for 20 seconds and then exits the JVM.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    init();
    String key = null;
    key = readKey(key);
    System.out.println("The demo started and will last 20 seconds...");
    //Start spider
    OOSpider.create(Site.me(), clazzMap.get(key)).addUrl(urlMap.get(key)).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).runAsync();

    try {
        Thread.sleep(20000);
    } catch (InterruptedException e) {
        e.printStackTrace();
        // Restore the interrupt flag so the interruption is not silently lost.
        Thread.currentThread().interrupt();
    }
    System.out.println("The demo stopped!");
    System.out.println("To more usage, try to customize your own Spider!");
    // The spider was started asynchronously; exit explicitly to end the demo.
    System.exit(0);
}
 
Example #13
Source File: BaiduBaike.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Demonstrates a single download and a batch download of Baidu Baike entries
 * with an {@link OOSpider}, printing each extracted model.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduBaike.class);
    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";

    // Single download.
    BaiduBaike single = ooSpider.<BaiduBaike>get("http://baike.baidu.com/search/word?word=httpclient&pic=1&sug=1&enc=utf8");
    System.out.println(single);

    // Batch download; the last keyword appears twice, as in the original list.
    String[] keywords = {"风力发电", "太阳能", "地热发电", "地热发电"};
    List<String> urls = new ArrayList<String>();
    for (String keyword : keywords) {
        urls.add(String.format(urlTemplate,keyword));
    }
    List<BaiduBaike> entries = ooSpider.<BaiduBaike>getAll(urls);
    for (int i = 0; i < entries.size(); i++) {
        System.out.println(entries.get(i));
    }
    ooSpider.close();
}
 
Example #14
Source File: SeleniumDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Downloads the same page 100 times through a {@link SeleniumDownloader},
 * printing the matched pin links each time and the total elapsed milliseconds.
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    long startedAt = System.currentTimeMillis();
    int remaining = 100;
    while (remaining-- > 0) {
        // A fresh task is created for every download, as in the original loop.
        Task huabanTask = new Task() {
            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        };
        Page fetched = downloader.download(new Request("http://huaban.com/"), huabanTask);
        System.out.println(fetched.getHtml().$("#waterfall").links().regex(".*pins.*").all());
    }
    long elapsedMillis = System.currentTimeMillis() - startedAt;
    System.out.println(elapsedMillis);
}
 
Example #15
Source File: SeleniumDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Downloads a Baidu Wenku document through a {@link SeleniumDownloader} and
 * prints the text of {@code div.inner} with tags and non-breaking-space
 * entities removed.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
        @Override
        public String getUUID() {
            return "huaban.com";
        }

        @Override
        public Site getSite() {
            return Site.me();
        }
    });
    // The entity was previously misspelled "&nsbp;", so "&nbsp;" was never stripped.
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nbsp;","").all());
}
 
Example #16
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that a header configured on the {@link Site} is sent with the request.
 */
@Test
public void test_set_site_header() throws Exception {
    HttpServer mockServer = httpServer(13423);
    // Stub: reply "ok" to GET requests whose header "header" equals "header-webmagic".
    mockServer.get(eq(header("header"), "header-webmagic")).response("ok");
    Runner.running(mockServer, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();
            Request plainRequest = new Request();
            plainRequest.setUrl("http://127.0.0.1:13423");

            Site headerSite = Site.me().addHeader("header","header-webmagic");
            Page downloaded = downloader.download(plainRequest, headerSite.toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
 
Example #17
Source File: ModelPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@link ModelPageProcessor} for the site and registers every given
 * page-model class on it, in argument order.
 *
 * @param site   site passed to the processor's constructor
 * @param clazzs page-model classes to register
 * @return the populated processor
 */
public static ModelPageProcessor create(Site site, Class... clazzs) {
    ModelPageProcessor processor = new ModelPageProcessor(site);
    for (int i = 0; i < clazzs.length; i++) {
        processor.addPageModel(clazzs[i]);
    }
    return processor;
}
 
Example #18
Source File: HttpClientGenerator.java    From plumemo with Apache License 2.0 5 votes vote down vote up
/**
 * Builds an HTTP client configured from the site: user agent, optional gzip
 * {@code Accept-Encoding} header, redirect strategy, socket settings, retry
 * handler and cookies (via {@code generateCookie}).
 *
 * @param site site whose settings drive the client configuration
 * @return the configured client
 */
private CloseableHttpClient generateClient(Site site) {
    HttpClientBuilder builder = HttpClients.custom();

    builder.setConnectionManager(connectionManager);
    // Fall back to an empty user agent when the site defines none.
    builder.setUserAgent(site.getUserAgent() != null ? site.getUserAgent() : "");
    if (site.isUseGzip()) {
        builder.addInterceptorFirst(new HttpRequestInterceptor() {

            @Override
            public void process(
                    HttpRequest request,
                    HttpContext context) throws HttpException, IOException {
                // Ask for gzip only if the request does not already set the header.
                if (!request.containsHeader("Accept-Encoding")) {
                    request.addHeader("Accept-Encoding", "gzip");
                }
            }
        });
    }
    // Work around the POST/redirect/POST 302 redirect issue.
    builder.setRedirectStrategy(new CustomRedirectStrategy());

    SocketConfig socketConfig = SocketConfig.custom()
            .setSoKeepAlive(true)
            .setTcpNoDelay(true)
            .setSoTimeout(site.getTimeOut())
            .build();
    // The same socket settings go to both the builder and the connection manager.
    builder.setDefaultSocketConfig(socketConfig);
    connectionManager.setDefaultSocketConfig(socketConfig);
    builder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
    generateCookie(builder, site);
    return builder.build();
}
 
Example #19
Source File: HttpClientGenerator.java    From blog-hunter with MIT License 5 votes vote down vote up
/**
 * Creates a {@link CloseableHttpClient} whose user agent, gzip negotiation,
 * redirect strategy, socket settings, retry handler and cookies are taken
 * from the given site.
 *
 * @param site source of the client settings
 * @return the built client
 */
private CloseableHttpClient generateClient(Site site) {
    HttpClientBuilder clientBuilder = HttpClients.custom();
    clientBuilder.setConnectionManager(connectionManager);

    if (site.getUserAgent() == null) {
        clientBuilder.setUserAgent("");
    } else {
        clientBuilder.setUserAgent(site.getUserAgent());
    }

    if (site.isUseGzip()) {
        // Adds "Accept-Encoding: gzip" unless the request already carries that header.
        HttpRequestInterceptor gzipInterceptor = new HttpRequestInterceptor() {
            @Override
            public void process(
                    final HttpRequest request,
                    final HttpContext context) throws HttpException, IOException {
                if (request.containsHeader("Accept-Encoding")) {
                    return;
                }
                request.addHeader("Accept-Encoding", "gzip");
            }
        };
        clientBuilder.addInterceptorFirst(gzipInterceptor);
    }

    // Fixes the POST/redirect/POST 302 redirect problem.
    clientBuilder.setRedirectStrategy(new CustomRedirectStrategy());

    SocketConfig.Builder socketSettings = SocketConfig.custom();
    socketSettings.setSoKeepAlive(true);
    socketSettings.setTcpNoDelay(true);
    socketSettings.setSoTimeout(site.getTimeOut());
    SocketConfig socketConfig = socketSettings.build();
    clientBuilder.setDefaultSocketConfig(socketConfig);
    connectionManager.setDefaultSocketConfig(socketConfig);

    clientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
    generateCookie(clientBuilder, site);
    return clientBuilder.build();
}
 
Example #20
Source File: TomHttpClientGenerator.java    From tom-crawler with Apache License 2.0 5 votes vote down vote up
/**
 * Assembles an HTTP client for the site.
 *
 * <p>Settings applied, in order: connection manager, user agent (empty string
 * when the site has none), optional gzip request interceptor, custom redirect
 * strategy, socket configuration, retry handler and cookies.
 *
 * @param site site providing user agent, gzip flag, timeout and retry count
 * @return the assembled client
 */
private CloseableHttpClient generateClient(Site site) {
    HttpClientBuilder b = HttpClients.custom();

    b.setConnectionManager(connectionManager);
    String userAgent = site.getUserAgent();
    b.setUserAgent(userAgent == null ? "" : userAgent);
    if (site.isUseGzip()) {
        b.addInterceptorFirst(new HttpRequestInterceptor() {

            @Override
            public void process(
                    final HttpRequest request,
                    final HttpContext context) throws HttpException, IOException {
                // Leave an explicit Accept-Encoding untouched; otherwise request gzip.
                boolean alreadySet = request.containsHeader("Accept-Encoding");
                if (!alreadySet) {
                    request.addHeader("Accept-Encoding", "gzip");
                }
            }
        });
    }
    // Resolve the POST/redirect/POST 302 redirect issue.
    b.setRedirectStrategy(new CustomRedirectStrategy());

    SocketConfig socketConfig = SocketConfig.custom()
            .setSoKeepAlive(true)
            .setTcpNoDelay(true)
            .setSoTimeout(site.getTimeOut())
            .build();
    b.setDefaultSocketConfig(socketConfig);
    // The connection manager receives the same socket configuration.
    connectionManager.setDefaultSocketConfig(socketConfig);
    b.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
    generateCookie(b, site);
    return b.build();
}
 
Example #21
Source File: GithubRepoTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Runs an {@link OOSpider} against the mocked GitHub page and asserts the
 * star and fork counts on the extracted {@code GithubRepo}.
 */
@Test
public void test() {
    PageModelPipeline<GithubRepo> countAssertions = new PageModelPipeline<GithubRepo>() {
        @Override
        public void process(GithubRepo repo, Task task) {
            assertThat(repo.getStar()).isEqualTo(86);
            assertThat(repo.getFork()).isEqualTo(70);
        }
    };
    OOSpider.create(Site.me().setSleepTime(0), countAssertions, GithubRepo.class)
            .addUrl("https://github.com/code4craft/webmagic")
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
 
Example #22
Source File: HunterProcessor.java    From blog-hunter with MIT License 5 votes vote down vote up
/**
 * Builds a new {@link Site} from the hunter configuration: basic crawl
 * settings first, then any configured cookies and request headers.
 *
 * @return the configured site
 */
@Override
public Site getSite() {
    Site site = Site.me()
            .setCharset(config.getCharset())
            .setDomain(config.getDomain())
            .setUserAgent(config.getUa())
            .setSleepTime(config.getSleepTime())
            .setRetryTimes(config.getRetryTimes())
            .setCycleRetryTimes(config.getCycleRetryTimes());

    // Add the cookies obtained from packet capture.
    List<Cookie> cookies = config.getCookies();
    if (CollectionUtils.isNotEmpty(cookies)) {
        for (Cookie cookie : cookies) {
            // Cookies without a domain use the two-argument overload.
            if (StringUtils.isEmpty(cookie.getDomain())) {
                site.addCookie(cookie.getName(), cookie.getValue());
            } else {
                site.addCookie(cookie.getDomain(), cookie.getName(), cookie.getValue());
            }
        }
    }

    // Add request headers; some sites use them to tell a browser from a crawler.
    Map<String, String> headers = config.getHeaders();
    if (MapUtils.isNotEmpty(headers)) {
        for (Map.Entry<String, String> header : headers.entrySet()) {
            site.addHeader(header.getKey(), header.getValue());
        }
    }
    return site;
}
 
Example #23
Source File: Kr36NewsModel.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Benchmark entry point: crawls 36kr with 20 threads, discarding extracted
 * models, and registers the spider with the {@link SpiderMonitor}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) throws IOException, JMException {
    // Just for benchmark: the pipeline intentionally does nothing with results.
    Spider spider = OOSpider.create(
            Site.me().setSleepTime(0),
            new PageModelPipeline() {
                @Override
                public void process(Object o, Task task) {
                    // no-op
                }
            },
            Kr36NewsModel.class)
            .thread(20)
            .addUrl("http://www.36kr.com/");
    spider.start();
    SpiderMonitor.instance().register(spider);
}
 
Example #24
Source File: OOSpider.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * create a spider
 *
 * @param site site
 * @param pageModelPipeline pageModelPipeline
 * @param pageModels pageModels
 */
public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) {
    this(ModelPageProcessor.create(site, pageModels));
    this.modelPipeline = new ModelPipeline();
    super.addPipeline(modelPipeline);
    for (Class pageModel : pageModels) {
        if (pageModelPipeline != null) {
            this.modelPipeline.put(pageModel, pageModelPipeline);
        }
        pageModelClasses.add(pageModel);
    }
}
 
Example #25
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that downloading {@code PAGE_ALWAYS_NOT_EXISTS} yields a page
 * flagged as not successfully downloaded.
 */
@Test
public void test_download_fail() {
    Request missingPageRequest = new Request(PAGE_ALWAYS_NOT_EXISTS);
    Task retryingTask = Site.me().setDomain("localhost").setCycleRetryTimes(5).toTask();
    Page result = new HttpClientDownloader().download(missingPageRequest, retryingTask);
    assertThat(result.isDownloadSuccess()).isFalse();
}
 
Example #26
Source File: IteyeBlogProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Returns this processor's {@link Site}, creating it on first call with the
 * domain {@code yanghaoli.iteye.com}.
 *
 * @return the lazily created site
 */
@Override
public Site getSite() {
    if (site != null) {
        return site;
    }
    site = Site.me().setDomain("yanghaoli.iteye.com");
    return site;
}
 
Example #27
Source File: HttpUriRequestConverter.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a crawler {@link Request} into an Apache {@link HttpUriRequest}.
 *
 * <p>Site-level headers are added first, then the request configuration
 * (timeouts and cookie spec from the site, optional proxy), and finally the
 * headers carried by the request itself.
 *
 * @param request request to convert; supplies method, URL and per-request headers
 * @param site    site supplying default headers and timeouts; may be {@code null},
 *                in which case no site headers or timeouts are applied
 * @param proxy   proxy to route through; may be {@code null}
 * @return the HTTP request ready for execution
 */
private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
    RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl()));
    // Guard against a null site here too: the timeout block below already tolerates
    // one, but the header lookup used to dereference site unconditionally.
    if (site != null && site.getHeaders() != null) {
        for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
            requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
        }
    }

    RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
    if (site != null) {
        // The single site timeout is used for connection request, socket and connect.
        requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                .setSocketTimeout(site.getTimeOut())
                .setConnectTimeout(site.getTimeOut())
                .setCookieSpec(CookieSpecs.STANDARD);
    }

    if (proxy != null) {
        requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()));
    }
    requestBuilder.setConfig(requestConfigBuilder.build());
    HttpUriRequest httpUriRequest = requestBuilder.build();
    // Request-level headers are added after the site-level ones.
    if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
        for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
            httpUriRequest.addHeader(header.getKey(), header.getValue());
        }
    }
    return httpUriRequest;
}
 
Example #28
Source File: HttpClientGenerator.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Generates a {@link CloseableHttpClient} for the given site.
 *
 * <p>The client uses the generator's connection manager, the site's user agent
 * (or an empty one), a gzip {@code Accept-Encoding} interceptor when the site
 * enables gzip, a custom redirect strategy, keep-alive/no-delay sockets with the
 * site timeout, a retry handler with the site's retry count, and site cookies.
 *
 * @param site site configuration
 * @return the generated client
 */
private CloseableHttpClient generateClient(Site site) {
    HttpClientBuilder httpClientBuilder = HttpClients.custom();
    httpClientBuilder.setConnectionManager(connectionManager);

    httpClientBuilder.setUserAgent(site.getUserAgent() != null ? site.getUserAgent() : "");

    if (site.isUseGzip()) {
        HttpRequestInterceptor acceptGzip = new HttpRequestInterceptor() {
            @Override
            public void process(
                    final HttpRequest request,
                    final HttpContext context) throws HttpException, IOException {
                // Only add the header when the request has not set one already.
                if (!request.containsHeader("Accept-Encoding")) {
                    request.addHeader("Accept-Encoding", "gzip");
                }
            }
        };
        httpClientBuilder.addInterceptorFirst(acceptGzip);
    }

    // Solve the POST/redirect/POST 302 redirect problem.
    httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());

    SocketConfig socketConfig = SocketConfig.custom()
            .setSoKeepAlive(true)
            .setTcpNoDelay(true)
            .setSoTimeout(site.getTimeOut())
            .build();
    httpClientBuilder.setDefaultSocketConfig(socketConfig);
    connectionManager.setDefaultSocketConfig(socketConfig);

    DefaultHttpRequestRetryHandler retryHandler =
            new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true);
    httpClientBuilder.setRetryHandler(retryHandler);
    generateCookie(httpClientBuilder, site);
    return httpClientBuilder.build();
}
 
Example #29
Source File: GooglePlayProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Returns this processor's {@link Site}, creating it on first call for the
 * domain {@code play.google.com} with a sleep time of 300.
 *
 * @return the lazily created site
 */
@Override
public Site getSite() {
    if (site != null) {
        return site;
    }
    site = Site.me().setDomain("play.google.com").setSleepTime(300);
    return site;
}
 
Example #30
Source File: AppStore.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Fetches one App Store lookup result with an {@link OOSpider} and prints
 * selected fields of the extracted model.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    OOSpider spider = OOSpider.create(Site.me(), AppStore.class);
    AppStore app = spider.<AppStore>get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software");

    System.out.println(app.trackName);
    System.out.println(app.description);
    System.out.println(app.userRatingCount);
    System.out.println(app.screenshotUrls);
    System.out.println(app.supportedDevices);
}