us.codecraft.webmagic.Site Java Examples
The following examples show how to use the us.codecraft.webmagic.Site class.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Verifies that a cookie added to the request itself reaches the server: the stub server is
 * configured to answer "ok" for a GET whose "cookie" cookie equals "cookie-webmagic".
 */
@Test
public void test_set_request_cookie() throws Exception {
    HttpServer cookieServer = httpServer(13423);
    cookieServer.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
    Runner.running(cookieServer, new Runnable() {
        @Override
        public void run() throws Exception {
            Request cookieRequest = new Request();
            cookieRequest.setUrl("http://127.0.0.1:13423");
            cookieRequest.addCookie("cookie", "cookie-webmagic");

            HttpClientDownloader downloader = new HttpClientDownloader();
            Page fetched = downloader.download(cookieRequest, Site.me().toTask());
            assertThat(fetched.getRawText()).isEqualTo("ok");
        }
    });
}
Example #2
Source File: ContentLengthLimitHttpClientDownloader.java From spider with GNU General Public License v3.0 | 6 votes |
/**
 * Returns the HTTP client registered for the site's domain, creating and storing one on first use.
 * When {@code site} is {@code null}, a client is requested from the generator and is not stored.
 *
 * @param site  the site whose domain keys the stored client; may be {@code null}
 * @param proxy the proxy handed to the client generator when a client has to be created
 * @return the stored or newly created client
 */
private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
    if (site == null) {
        return httpClientGenerator.getClient(null, proxy);
    }
    final String domainKey = site.getDomain();
    CloseableHttpClient client = httpClients.get(domainKey);
    if (client != null) {
        return client;
    }
    synchronized (this) {
        // Look again while holding the lock: another thread may have stored a client meanwhile.
        client = httpClients.get(domainKey);
        if (client == null) {
            client = httpClientGenerator.getClient(site, proxy);
            httpClients.put(domainKey, client);
        }
        return client;
    }
}
Example #3
Source File: RedisSchedulerTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Pushes one request (carrying an extra entry) into the Redis scheduler and checks that polling
 * with the same task returns an equal request. Ignored because it depends on the environment.
 */
@Ignore("environment depended")
@Test
public void test() {
    Task fixedTask = new Task() {
        @Override
        public String getUUID() {
            return "1";
        }

        @Override
        public Site getSite() {
            return null;
        }
    };

    Request pushed = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/");
    pushed.putExtra("1", "2");
    redisScheduler.push(pushed, fixedTask);

    Request polled = redisScheduler.poll(fixedTask);
    assertThat(polled).isEqualTo(pushed);
}
Example #4
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Verifies that proxy credentials from a {@code SimpleProxyProvider} are sent as a
 * Proxy-Authorization header. The expected value is "Basic " followed by the Base64 encoding of
 * "username:password"; the stub server acting as the proxy answers "ok" only for that header.
 */
@Test
public void test_download_auth_by_SimpleProxyProvider() throws Exception {
    HttpServer proxyServer = httpServer(13423);
    proxyServer.get(eq(header("Proxy-Authorization"), "Basic dXNlcm5hbWU6cGFzc3dvcmQ=")).response("ok");
    Runner.running(proxyServer, new Runnable() {
        @Override
        public void run() throws Exception {
            Proxy authenticatedProxy = new Proxy("127.0.0.1", 13423, "username", "password");
            HttpClientDownloader downloader = new HttpClientDownloader();
            downloader.setProxyProvider(SimpleProxyProvider.from(authenticatedProxy));

            Request proxiedRequest = new Request();
            proxiedRequest.setUrl("http://www.baidu.com");

            Page fetched = downloader.download(proxiedRequest, Site.me().toTask());
            assertThat(fetched.getRawText()).isEqualTo("ok");
        }
    });
}
Example #5
Source File: ConfigurablePageProcessorTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Runs a {@code ConfigurablePageProcessor} with two XPath rules ("title" and "star") against the
 * mocked GitHub page and checks both extracted values.
 */
@Test
public void test() throws Exception {
    ExtractRule titleRule = new ExtractRule();
    titleRule.setExpressionType(ExpressionType.XPath);
    titleRule.setExpressionValue("//title");
    titleRule.setFieldName("title");

    ExtractRule starRule = new ExtractRule();
    starRule.setExpressionType(ExpressionType.XPath);
    starRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
    starRule.setFieldName("star");

    List<ExtractRule> rules = new ArrayList<ExtractRule>();
    rules.add(titleRule);
    rules.add(starRule);

    ConfigurablePageProcessor processor = new ConfigurablePageProcessor(Site.me(), rules);
    ResultItems extracted = Spider.create(processor)
            .setDownloader(new MockGithubDownloader())
            .get("https://github.com/code4craft/webmagic");

    assertThat(extracted.getAll()).containsEntry("title", "<title>code4craft/webmagic · GitHub</title>");
    assertThat(extracted.getAll()).containsEntry("star", " 86 ");
}
Example #6
Source File: CasperjsDownloader.java From spider with GNU General Public License v3.0 | 6 votes |
/**
 * Downloads the page for {@code request} by asking CasperJS to gather its HTML.
 *
 * <p>On failure the request is handed to the cycle-retry mechanism when the task's site allows
 * cycle retries; otherwise the exception is stored on the request under the "EXCEPTION" extra,
 * {@code onError} is invoked and {@code null} is returned.
 *
 * @param request the request whose URL is fetched
 * @param task    the task supplying the {@link Site}; may be {@code null}
 * @return the downloaded page, the result of {@code addToCycleRetry} when a retry is scheduled,
 *         or {@code null} when the download failed without a retry
 */
@Override
public Page download(Request request, Task task) {
    Site site = null;
    if (task != null) {
        site = task.getSite();
    }

    String html;
    try {
        // NOTE(review): meaning of the boolean flag is not visible here — confirm against the Request model.
        html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
    } catch (Exception e) {
        // site is null when no task (or a task without a site) was supplied; guard the access so
        // the real failure is reported instead of being masked by a NullPointerException.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        request.putExtra("EXCEPTION", e);
        onError(request);
        return null;
    }

    Page page = new Page();
    page.setRawText(html);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    onSuccess(request);
    return page;
}
Example #7
Source File: ContentLengthLimitHttpClientDownloader.java From Gather-Platform with GNU General Public License v3.0 | 6 votes |
/**
 * Looks up the HTTP client stored under the site's domain and creates one on first use.
 * A {@code null} site bypasses the store entirely and asks the generator for a client directly.
 *
 * @param site  the site whose domain keys the stored client; may be {@code null}
 * @param proxy the proxy passed to the generator whenever a client must be created
 * @return the stored or freshly generated client
 */
private CloseableHttpClient getHttpClient(Site site, Proxy proxy) {
    if (site == null) {
        return httpClientGenerator.getClient(null, proxy);
    }

    final String domainKey = site.getDomain();
    CloseableHttpClient existing = httpClients.get(domainKey);
    if (existing != null) {
        return existing;
    }

    synchronized (this) {
        // Re-read while holding the lock so that only one client is generated per domain.
        CloseableHttpClient rechecked = httpClients.get(domainKey);
        if (rechecked != null) {
            return rechecked;
        }
        CloseableHttpClient created = httpClientGenerator.getClient(site, proxy);
        httpClients.put(domainKey, created);
        return created;
    }
}
Example #8
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Verifies that a cookie configured on the {@link Site} (with a matching domain) is sent to the
 * server: the stub server answers "ok" for a GET whose "cookie" cookie equals "cookie-webmagic".
 */
@Test
public void test_set_site_cookie() throws Exception {
    HttpServer cookieServer = httpServer(13423);
    cookieServer.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
    Runner.running(cookieServer, new Runnable() {
        @Override
        public void run() throws Exception {
            Request plainRequest = new Request();
            plainRequest.setUrl("http://127.0.0.1:13423");

            Site cookieSite = Site.me()
                    .addCookie("cookie", "cookie-webmagic")
                    .setDomain("127.0.0.1");

            HttpClientDownloader downloader = new HttpClientDownloader();
            Page fetched = downloader.download(plainRequest, cookieSite.toTask());
            assertThat(fetched.getRawText()).isEqualTo("ok");
        }
    });
}
Example #9
Source File: CommonSpider.java From Gather-Platform with GNU General Public License v3.0 | 6 votes |
public MyPageProcessor(SpiderInfo info, Task task) { this.site = Site.me().setDomain(info.getDomain()).setTimeOut(info.getTimeout()) .setRetryTimes(info.getRetry()).setSleepTime(info.getSleep()) .setCharset(StringUtils.isBlank(info.getCharset()) ? null : info.getCharset()) .setUserAgent(info.getUserAgent()); //设置抓取代理IP与接口 if (StringUtils.isNotBlank(info.getProxyHost()) && info.getProxyPort() > 0) { this.site.setHttpProxy(new HttpHost(info.getProxyHost(), info.getProxyPort())); //设置代理的认证 if (StringUtils.isNotBlank(info.getProxyUsername()) && StringUtils.isNotBlank(info.getProxyPassword())) { this.site.setUsernamePasswordCredentials(new UsernamePasswordCredentials(info.getProxyUsername(), info.getProxyPassword())); } } this.info = info; this.task = task; }
Example #10
Source File: CommonSpider.java From spider with GNU General Public License v3.0 | 6 votes |
public MyPageProcessor(SpiderInfo info, Task task) { this.site = Site.me().setDomain(info.getDomain()).setTimeOut(info.getTimeout()) .setRetryTimes(info.getRetry()).setSleepTime(info.getSleep()) .setCharset(StringUtils.isBlank(info.getCharset()) ? null : info.getCharset()) .setUserAgent(info.getUserAgent()); //设置抓取代理IP与接口 if (StringUtils.isNotBlank(info.getProxyHost()) && info.getProxyPort() > 0) { this.site.setHttpProxy(new HttpHost(info.getProxyHost(), info.getProxyPort())); //设置代理的认证 if (StringUtils.isNotBlank(info.getProxyUsername()) && StringUtils.isNotBlank(info.getProxyPassword())) { this.site.setUsernamePasswordCredentials(new UsernamePasswordCredentials(info.getProxyUsername(), info.getProxyPassword())); } } this.info = info; this.task = task; }
Example #11
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Verifies that a request cookie is not sent once cookie management is disabled on the site:
 * the stub server answers "ok" only when the "cookie" cookie does NOT equal "cookie-webmagic".
 */
@Test
public void test_disableCookieManagement() throws Exception {
    HttpServer noCookieServer = httpServer(13423);
    noCookieServer.get(not(eq(cookie("cookie"), "cookie-webmagic"))).response("ok");
    Runner.running(noCookieServer, new Runnable() {
        @Override
        public void run() throws Exception {
            Request cookieRequest = new Request();
            cookieRequest.setUrl("http://127.0.0.1:13423");
            cookieRequest.addCookie("cookie", "cookie-webmagic");

            Site cookielessSite = Site.me().setDisableCookieManagement(true);

            HttpClientDownloader downloader = new HttpClientDownloader();
            Page fetched = downloader.download(cookieRequest, cookielessSite.toTask());
            assertThat(fetched.getRawText()).isEqualTo("ok");
        }
    });
}
Example #12
Source File: QuickStarter.java From webmagic with Apache License 2.0 | 6 votes |
public static void main(String[] args) { init(); String key = null; key = readKey(key); System.out.println("The demo started and will last 20 seconds..."); //Start spider OOSpider.create(Site.me(), clazzMap.get(key)).addUrl(urlMap.get(key)).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).runAsync(); try { Thread.sleep(20000); } catch (InterruptedException e) { e.printStackTrace(); } System.out.println("The demo stopped!"); System.out.println("To more usage, try to customize your own Spider!"); System.exit(0); }
Example #13
Source File: BaiduBaike.java From webmagic with Apache License 2.0 | 6 votes |
public static void main(String[] args) { OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduBaike.class); //single download String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; BaiduBaike baike = ooSpider.<BaiduBaike>get("http://baike.baidu.com/search/word?word=httpclient&pic=1&sug=1&enc=utf8"); System.out.println(baike); //multidownload List<String> list = new ArrayList<String>(); list.add(String.format(urlTemplate,"风力发电")); list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电")); List<BaiduBaike> resultItemses = ooSpider.<BaiduBaike>getAll(list); for (BaiduBaike resultItemse : resultItemses) { System.out.println(resultItemse); } ooSpider.close(); }
Example #14
Source File: SeleniumDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Downloads the same page 100 times through the Selenium downloader, printing the matching links
 * of each download and finally the elapsed milliseconds. Ignored because it needs a Chrome driver.
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    final long startMillis = System.currentTimeMillis();

    int round = 0;
    while (round < 100) {
        Task huabanTask = new Task() {
            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        };
        Page fetched = downloader.download(new Request("http://huaban.com/"), huabanTask);
        System.out.println(fetched.getHtml().$("#waterfall").links().regex(".*pins.*").all());
        round++;
    }

    System.out.println(System.currentTimeMillis() - startMillis);
}
Example #15
Source File: SeleniumDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Downloads a Baidu Wenku document through the Selenium downloader (with a 10 second sleep time)
 * and prints the text of every {@code div.inner} element after stripping markup tags and
 * non-breaking-space entities.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);

    Page page = seleniumDownloader.download(
            new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"),
            new Task() {
                @Override
                public String getUUID() {
                    return "huaban.com";
                }

                @Override
                public Site getSite() {
                    return Site.me();
                }
            });

    // The entity is "&nbsp;"; the previous "&nsbp;" spelling never matched anything.
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>", "").replace("&nbsp;", "").all());
}
Example #16
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Verifies that a header configured on the {@link Site} is sent with the request: the stub server
 * answers "ok" for a GET whose "header" header equals "header-webmagic".
 */
@Test
public void test_set_site_header() throws Exception {
    HttpServer headerServer = httpServer(13423);
    headerServer.get(eq(header("header"), "header-webmagic")).response("ok");
    Runner.running(headerServer, new Runnable() {
        @Override
        public void run() throws Exception {
            Request plainRequest = new Request();
            plainRequest.setUrl("http://127.0.0.1:13423");

            Site headerSite = Site.me().addHeader("header", "header-webmagic");

            HttpClientDownloader downloader = new HttpClientDownloader();
            Page fetched = downloader.download(plainRequest, headerSite.toTask());
            assertThat(fetched.getRawText()).isEqualTo("ok");
        }
    });
}
Example #17
Source File: ModelPageProcessor.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Creates a {@code ModelPageProcessor} for {@code site} and registers every given page model
 * class with it, in argument order.
 *
 * @param site   the site passed to the processor's constructor
 * @param clazzs the page model classes to register
 * @return the processor with all models added
 */
public static ModelPageProcessor create(Site site, Class... clazzs) {
    final ModelPageProcessor processor = new ModelPageProcessor(site);
    for (int i = 0; i < clazzs.length; i++) {
        processor.addPageModel(clazzs[i]);
    }
    return processor;
}
Example #18
Source File: HttpClientGenerator.java From plumemo with Apache License 2.0 | 5 votes |
private CloseableHttpClient generateClient(Site site) { HttpClientBuilder httpClientBuilder = HttpClients.custom(); httpClientBuilder.setConnectionManager(connectionManager); if (site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); } else { httpClientBuilder.setUserAgent(""); } if (site.isUseGzip()) { httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { @Override public void process( HttpRequest request, HttpContext context) throws HttpException, IOException { if (!request.containsHeader("Accept-Encoding")) { request.addHeader("Accept-Encoding", "gzip"); } } }); } //解决post/redirect/post 302跳转问题 httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy()); SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); socketConfigBuilder.setSoTimeout(site.getTimeOut()); SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); generateCookie(httpClientBuilder, site); return httpClientBuilder.build(); }
Example #19
Source File: HttpClientGenerator.java From blog-hunter with MIT License | 5 votes |
private CloseableHttpClient generateClient(Site site) { HttpClientBuilder httpClientBuilder = HttpClients.custom(); httpClientBuilder.setConnectionManager(connectionManager); if (site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); } else { httpClientBuilder.setUserAgent(""); } if (site.isUseGzip()) { httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { public void process( final HttpRequest request, final HttpContext context) throws HttpException, IOException { if (!request.containsHeader("Accept-Encoding")) { request.addHeader("Accept-Encoding", "gzip"); } } }); } //解决post/redirect/post 302跳转问题 httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy()); SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); socketConfigBuilder.setSoTimeout(site.getTimeOut()); SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); generateCookie(httpClientBuilder, site); return httpClientBuilder.build(); }
Example #20
Source File: TomHttpClientGenerator.java From tom-crawler with Apache License 2.0 | 5 votes |
private CloseableHttpClient generateClient(Site site) { HttpClientBuilder httpClientBuilder = HttpClients.custom(); httpClientBuilder.setConnectionManager(connectionManager); if (site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); } else { httpClientBuilder.setUserAgent(""); } if (site.isUseGzip()) { httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { public void process( final HttpRequest request, final HttpContext context) throws HttpException, IOException { if (!request.containsHeader("Accept-Encoding")) { request.addHeader("Accept-Encoding", "gzip"); } } }); } //解决post/redirect/post 302跳转问题 httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy()); SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); socketConfigBuilder.setSoTimeout(site.getTimeOut()); SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); generateCookie(httpClientBuilder, site); return httpClientBuilder.build(); }
Example #21
Source File: GithubRepoTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Runs the {@code GithubRepo} page model against the mocked GitHub page and checks the extracted
 * star and fork counts inside the pipeline callback.
 */
@Test
public void test() {
    PageModelPipeline<GithubRepo> countChecker = new PageModelPipeline<GithubRepo>() {
        @Override
        public void process(GithubRepo o, Task task) {
            assertThat(o.getStar()).isEqualTo(86);
            assertThat(o.getFork()).isEqualTo(70);
        }
    };

    OOSpider.create(Site.me().setSleepTime(0), countChecker, GithubRepo.class)
            .addUrl("https://github.com/code4craft/webmagic")
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
Example #22
Source File: HunterProcessor.java From blog-hunter with MIT License | 5 votes |
@Override public Site getSite() { Site site = Site.me() .setCharset(config.getCharset()) .setDomain(config.getDomain()) .setUserAgent(config.getUa()) .setSleepTime(config.getSleepTime()) .setRetryTimes(config.getRetryTimes()) .setCycleRetryTimes(config.getCycleRetryTimes()); //添加抓包获取的cookie信息 List<Cookie> cookies = config.getCookies(); if (CollectionUtils.isNotEmpty(cookies)) { for (Cookie cookie : cookies) { if (StringUtils.isEmpty(cookie.getDomain())) { site.addCookie(cookie.getName(), cookie.getValue()); continue; } site.addCookie(cookie.getDomain(), cookie.getName(), cookie.getValue()); } } //添加请求头,有些网站会根据请求头判断该请求是由浏览器发起还是由爬虫发起的 Map<String, String> headers = config.getHeaders(); if (MapUtils.isNotEmpty(headers)) { Set<Map.Entry<String, String>> entrySet = headers.entrySet(); for (Map.Entry<String, String> entry : entrySet) { site.addHeader(entry.getKey(), entry.getValue()); } } return site; }
Example #23
Source File: Kr36NewsModel.java From webmagic with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws IOException, JMException { //Just for benchmark Spider thread = OOSpider.create(Site.me().setSleepTime(0), new PageModelPipeline() { @Override public void process(Object o, Task task) { } }, Kr36NewsModel.class).thread(20).addUrl("http://www.36kr.com/"); thread.start(); SpiderMonitor spiderMonitor = SpiderMonitor.instance(); spiderMonitor.register(thread); }
Example #24
Source File: OOSpider.java From webmagic with Apache License 2.0 | 5 votes |
/** * create a spider * * @param site site * @param pageModelPipeline pageModelPipeline * @param pageModels pageModels */ public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { this(ModelPageProcessor.create(site, pageModels)); this.modelPipeline = new ModelPipeline(); super.addPipeline(modelPipeline); for (Class pageModel : pageModels) { if (pageModelPipeline != null) { this.modelPipeline.put(pageModel, pageModelPipeline); } pageModelClasses.add(pageModel); } }
Example #25
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Downloads a page that never exists and checks that the resulting page is flagged as not
 * successfully downloaded.
 */
@Test
public void test_download_fail() {
    Task retryingTask = Site.me()
            .setDomain("localhost")
            .setCycleRetryTimes(5)
            .toTask();
    Request missingPage = new Request(PAGE_ALWAYS_NOT_EXISTS);

    HttpClientDownloader downloader = new HttpClientDownloader();
    Page result = downloader.download(missingPage, retryingTask);

    assertThat(result.isDownloadSuccess()).isFalse();
}
Example #26
Source File: IteyeBlogProcessor.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Returns the site for this processor, creating it with the iteye blog domain on first access.
 *
 * @return the stored site, never replaced once created
 */
@Override
public Site getSite() {
    if (site != null) {
        return site;
    }
    site = Site.me().setDomain("yanghaoli.iteye.com");
    return site;
}
Example #27
Source File: HttpUriRequestConverter.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Converts a crawler {@link Request} into an Apache {@code HttpUriRequest}.
 *
 * <p>Site-level headers are added first, then the request configuration (timeouts and cookie
 * spec from the site, proxy if given) is applied, and finally request-level headers are added
 * to the built request.
 *
 * @param request the crawler request supplying the URL, method and per-request headers
 * @param site    the site supplying default headers and timeouts; may be {@code null}, in which
 *                case no site headers or timeouts are applied
 * @param proxy   the proxy to route through; may be {@code null}
 * @return the HTTP request ready for execution
 */
private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
    RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl()));

    // Guard against a null site here as well: the timeout section below already treats a null
    // site as valid, so dereferencing it unguarded would fail before that check is reached.
    if (site != null && site.getHeaders() != null) {
        for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
            requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
        }
    }

    RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
    if (site != null) {
        // The single site timeout is used for connection request, socket and connect timeouts.
        requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                .setSocketTimeout(site.getTimeOut())
                .setConnectTimeout(site.getTimeOut())
                .setCookieSpec(CookieSpecs.STANDARD);
    }
    if (proxy != null) {
        requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()));
    }
    requestBuilder.setConfig(requestConfigBuilder.build());

    HttpUriRequest httpUriRequest = requestBuilder.build();
    // Request-level headers are added after the site-level ones, on the built request.
    if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
        for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
            httpUriRequest.addHeader(header.getKey(), header.getValue());
        }
    }
    return httpUriRequest;
}
Example #28
Source File: HttpClientGenerator.java From webmagic with Apache License 2.0 | 5 votes |
private CloseableHttpClient generateClient(Site site) { HttpClientBuilder httpClientBuilder = HttpClients.custom(); httpClientBuilder.setConnectionManager(connectionManager); if (site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); } else { httpClientBuilder.setUserAgent(""); } if (site.isUseGzip()) { httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { public void process( final HttpRequest request, final HttpContext context) throws HttpException, IOException { if (!request.containsHeader("Accept-Encoding")) { request.addHeader("Accept-Encoding", "gzip"); } } }); } //解决post/redirect/post 302跳转问题 httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy()); SocketConfig.Builder socketConfigBuilder = SocketConfig.custom(); socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true); socketConfigBuilder.setSoTimeout(site.getTimeOut()); SocketConfig socketConfig = socketConfigBuilder.build(); httpClientBuilder.setDefaultSocketConfig(socketConfig); connectionManager.setDefaultSocketConfig(socketConfig); httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); generateCookie(httpClientBuilder, site); return httpClientBuilder.build(); }
Example #29
Source File: GooglePlayProcessor.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Returns the site for this processor, creating it on first access with the Google Play domain
 * and a sleep time of 300.
 *
 * @return the stored site, never replaced once created
 */
@Override
public Site getSite() {
    if (site != null) {
        return site;
    }
    site = Site.me().setDomain("play.google.com").setSleepTime(300);
    return site;
}
Example #30
Source File: AppStore.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Demo entry point: fetches one iTunes lookup result as an {@code AppStore} model and prints a
 * handful of its fields.
 */
public static void main(String[] args) {
    OOSpider spider = OOSpider.create(Site.me(), AppStore.class);
    AppStore app = spider.<AppStore>get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software");

    System.out.println(app.trackName);
    System.out.println(app.description);
    System.out.println(app.userRatingCount);
    System.out.println(app.screenshotUrls);
    System.out.println(app.supportedDevices);
}