us.codecraft.webmagic.utils.UrlUtils Java Examples
The following examples show how to use
us.codecraft.webmagic.utils.UrlUtils.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Spider.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Downloads the given urls synchronously and returns everything the
 * collector pipeline gathered.
 *
 * <p>While the download runs, {@code destroyWhenExit} and {@code spawnUrl}
 * are switched off; both are switched back on before this method exits,
 * including when it exits with an exception.
 *
 * @param urls urls
 * @param <T>  type of process result
 * @return list downloaded
 */
public <T> List<T> getAll(Collection<String> urls) {
    destroyWhenExit = false;
    spawnUrl = false;
    CollectorPipeline collectorPipeline;
    try {
        if (startRequests != null) {
            startRequests.clear();
        }
        for (Request request : UrlUtils.convertToRequests(urls)) {
            addRequest(request);
        }
        collectorPipeline = getCollectorPipeline();
        pipelines.add(collectorPipeline);
        run();
    } finally {
        // Restore the flags even if queuing or run() throws; otherwise the
        // instance would be left with both flags stuck at false.
        spawnUrl = true;
        destroyWhenExit = true;
    }
    return collectorPipeline.getCollected();
}
Example #2
Source File: HttpUriRequestConverter.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Builds the {@link HttpClientContext} for a single request.
 *
 * <p>Proxy credentials are attached only when a proxy with a non-null
 * username is supplied. A cookie store is attached only when the request
 * carries at least one cookie; each cookie's domain is derived from the
 * request url via {@code UrlUtils.getDomain} followed by
 * {@code UrlUtils.removePort}.
 *
 * @param request request whose cookies and url are read
 * @param site    not read by this method
 * @param proxy   proxy to authenticate against, may be {@code null}
 * @return the populated context
 */
private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) {
    HttpClientContext context = new HttpClientContext();

    boolean proxyHasCredentials = proxy != null && proxy.getUsername() != null;
    if (proxyHasCredentials) {
        AuthState proxyAuthState = new AuthState();
        proxyAuthState.update(
                new BasicScheme(ChallengeState.PROXY),
                new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
        context.setAttribute(HttpClientContext.PROXY_AUTH_STATE, proxyAuthState);
    }

    Map<String, String> cookies = request.getCookies();
    if (cookies == null || cookies.isEmpty()) {
        return context;
    }

    CookieStore store = new BasicCookieStore();
    for (Map.Entry<String, String> entry : cookies.entrySet()) {
        BasicClientCookie cookie = new BasicClientCookie(entry.getKey(), entry.getValue());
        String cookieDomain = UrlUtils.removePort(UrlUtils.getDomain(request.getUrl()));
        cookie.setDomain(cookieDomain);
        store.addCookie(cookie);
    }
    context.setCookieStore(store);
    return context;
}
Example #3
Source File: Page.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Queues a batch of urls to fetch.
 *
 * <p>Blank entries, the bare fragment {@code "#"} and {@code javascript:}
 * links are skipped. Every other entry is canonicalized against this page's
 * url before being wrapped in a {@code Request}.
 *
 * @param requests requests
 */
public void addTargetRequests(List<String> requests) {
    for (String candidate : requests) {
        boolean skip = StringUtils.isBlank(candidate)
                || candidate.equals("#")
                || candidate.startsWith("javascript:");
        if (!skip) {
            String canonicalUrl = UrlUtils.canonicalizeUrl(candidate, url.toString());
            targetRequests.add(new Request(canonicalUrl));
        }
    }
}
Example #4
Source File: Page.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Queues a batch of urls to fetch, all with the same priority.
 *
 * <p>Blank entries, the bare fragment {@code "#"} and {@code javascript:}
 * links are skipped. Every other entry is canonicalized against this page's
 * url before being wrapped in a {@code Request}.
 *
 * @param requests requests
 * @param priority priority
 */
public void addTargetRequests(List<String> requests, long priority) {
    for (String link : requests) {
        if (!StringUtils.isBlank(link) && !link.equals("#") && !link.startsWith("javascript:")) {
            String canonicalUrl = UrlUtils.canonicalizeUrl(link, url.toString());
            targetRequests.add(new Request(canonicalUrl).setPriority(priority));
        }
    }
}
Example #5
Source File: Page.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Queues a single url to fetch.
 *
 * <p>Blank input, the bare fragment {@code "#"} and {@code javascript:}
 * links are ignored, matching the filtering applied by the list-based
 * {@code addTargetRequests} overloads. Anything else is canonicalized
 * against this page's url before being wrapped in a {@code Request}.
 *
 * @param requestString requestString
 */
public void addTargetRequest(String requestString) {
    if (StringUtils.isBlank(requestString)
            || requestString.equals("#")
            // javascript: pseudo-links are not fetchable; the batch overloads
            // already drop them, so do the same here.
            || requestString.startsWith("javascript:")) {
        return;
    }
    requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
    targetRequests.add(new Request(requestString));
}
Example #6
Source File: HttpUriRequestConverter.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Builds the {@link HttpUriRequest} for a single request.
 *
 * <p>The request method comes from {@code selectRequestMethod}; the uri is
 * the request url passed through {@code UrlUtils.fixIllegalCharacterInUrl}.
 * Site headers are added first and request headers are added afterwards on
 * the built request.
 *
 * @param request request supplying the url and per-request headers
 * @param site    source of default headers and timeouts, may be {@code null}
 * @param proxy   proxy to route through, may be {@code null}
 * @return the fully configured request
 */
private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
    RequestBuilder requestBuilder = selectRequestMethod(request)
            .setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl()));

    // site is treated as nullable further down, so guard the header lookup
    // the same way instead of dereferencing it unconditionally.
    if (site != null && site.getHeaders() != null) {
        for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
            requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
        }
    }

    RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
    if (site != null) {
        requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                .setSocketTimeout(site.getTimeOut())
                .setConnectTimeout(site.getTimeOut())
                .setCookieSpec(CookieSpecs.STANDARD);
    }
    if (proxy != null) {
        requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()));
    }
    requestBuilder.setConfig(requestConfigBuilder.build());

    HttpUriRequest httpUriRequest = requestBuilder.build();
    if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
        for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
            httpUriRequest.addHeader(header.getKey(), header.getValue());
        }
    }
    return httpUriRequest;
}
Example #7
Source File: Spider.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Pushes a request to the scheduler.
 *
 * <p>If the site has no domain yet and the request carries a url, the
 * site's domain is first set from that url via {@code UrlUtils.getDomain}.
 * The request is pushed regardless of whether the domain was updated.
 *
 * @param request request to schedule
 */
private void addRequest(Request request) {
    if (site.getDomain() == null) {
        boolean hasUrl = request != null && request.getUrl() != null;
        if (hasUrl) {
            site.setDomain(UrlUtils.getDomain(request.getUrl()));
        }
    }
    scheduler.push(request, this);
}
Example #8
Source File: HttpUriRequestConverterTest.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Checks that a url ending in {@code ##}, once passed through
 * {@code UrlUtils.fixIllegalCharacterInUrl}, is converted into a request
 * whose URI ends in a single {@code #}.
 */
@Test
public void test_illegal_uri_correct() throws Exception {
    HttpUriRequestConverter converter = new HttpUriRequestConverter();
    Request request = new Request(UrlUtils.fixIllegalCharacterInUrl("http://bj.zhongkao.com/beikao/yimo/##"));

    HttpClientRequestContext requestContext = converter.convert(request, Site.me(), null);

    assertThat(requestContext.getHttpUriRequest().getURI())
            .isEqualTo(new URI("http://bj.zhongkao.com/beikao/yimo/#"));
}
Example #9
Source File: SpiderMonitor.java From webmagic with Apache License 2.0 | 4 votes |
protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException { // ObjectName objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName())); mbeanServer.registerMBean(spiderStatus, objName); }