us.codecraft.webmagic.utils.UrlUtils Java Examples

The following examples show how to use us.codecraft.webmagic.utils.UrlUtils. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Spider.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Downloads the given urls synchronously and returns everything collected.
 *
 * <p>While the crawl is in progress {@code destroyWhenExit} and {@code spawnUrl}
 * are switched off; both are set back to {@code true} before this method
 * returns, including when queuing the requests or {@link #run()} throws.
 * Any previously configured {@code startRequests} are cleared first.
 *
 * @param urls urls to download
 * @param <T> type of process result
 * @return the results gathered by the collector pipeline
 */
public <T> List<T> getAll(Collection<String> urls) {
    destroyWhenExit = false;
    spawnUrl = false;
    CollectorPipeline collectorPipeline;
    try {
        if (startRequests != null) {
            startRequests.clear();
        }
        for (Request request : UrlUtils.convertToRequests(urls)) {
            addRequest(request);
        }
        collectorPipeline = getCollectorPipeline();
        pipelines.add(collectorPipeline);
        // run() is invoked directly, so this call blocks until it returns.
        run();
    } finally {
        // Restore the flags on every exit path so a failed crawl does not
        // leave the spider with both settings stuck at false.
        spawnUrl = true;
        destroyWhenExit = true;
    }
    return collectorPipeline.getCollected();
}
 
Example #2
Source File: HttpUriRequestConverter.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the {@link HttpClientContext} for a request.
 *
 * <p>When a proxy with a non-null username is supplied, a proxy auth state
 * carrying that username/password is stored under
 * {@code HttpClientContext.PROXY_AUTH_STATE}. When the request has cookies,
 * each one is added to a fresh cookie store with its domain set to
 * {@code UrlUtils.removePort(UrlUtils.getDomain(request.getUrl()))}.
 *
 * @param request request whose cookies and url are read
 * @param site    not used by this method
 * @param proxy   proxy to authenticate against, may be {@code null}
 * @return the populated context
 */
private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) {
    HttpClientContext context = new HttpClientContext();

    boolean proxyNeedsAuth = proxy != null && proxy.getUsername() != null;
    if (proxyNeedsAuth) {
        AuthState proxyAuth = new AuthState();
        UsernamePasswordCredentials credentials =
                new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword());
        proxyAuth.update(new BasicScheme(ChallengeState.PROXY), credentials);
        context.setAttribute(HttpClientContext.PROXY_AUTH_STATE, proxyAuth);
    }

    // Nothing more to do when the request carries no cookies.
    if (request.getCookies() == null || request.getCookies().isEmpty()) {
        return context;
    }

    CookieStore store = new BasicCookieStore();
    for (Map.Entry<String, String> entry : request.getCookies().entrySet()) {
        BasicClientCookie cookie = new BasicClientCookie(entry.getKey(), entry.getValue());
        String cookieDomain = UrlUtils.removePort(UrlUtils.getDomain(request.getUrl()));
        cookie.setDomain(cookieDomain);
        store.addCookie(cookie);
    }
    context.setCookieStore(store);
    return context;
}
 
Example #3
Source File: Page.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Adds urls to fetch.
 *
 * <p>Blank entries, a bare {@code "#"} and entries starting with
 * {@code "javascript:"} are skipped. Every other entry is passed through
 * {@code UrlUtils.canonicalizeUrl} together with this page's url and added
 * to the target requests.
 *
 * @param requests urls to add
 */
public void addTargetRequests(List<String> requests) {
    for (String link : requests) {
        boolean skip = StringUtils.isBlank(link)
                || link.equals("#")
                || link.startsWith("javascript:");
        if (!skip) {
            String canonical = UrlUtils.canonicalizeUrl(link, url.toString());
            targetRequests.add(new Request(canonical));
        }
    }
}
 
Example #4
Source File: Page.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Adds urls to fetch, assigning the same priority to each resulting request.
 *
 * <p>Blank entries, a bare {@code "#"} and entries starting with
 * {@code "javascript:"} are ignored; the rest are passed through
 * {@code UrlUtils.canonicalizeUrl} with this page's url before being queued.
 *
 * @param requests urls to add
 * @param priority priority set on every created request
 */
public void addTargetRequests(List<String> requests, long priority) {
    for (String candidate : requests) {
        if (StringUtils.isBlank(candidate)) {
            continue;
        }
        if (candidate.equals("#") || candidate.startsWith("javascript:")) {
            continue;
        }
        String resolved = UrlUtils.canonicalizeUrl(candidate, url.toString());
        Request target = new Request(resolved).setPriority(priority);
        targetRequests.add(target);
    }
}
 
Example #5
Source File: Page.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Adds a single url to fetch.
 *
 * <p>The url is ignored when it is blank, is a bare {@code "#"}, or starts
 * with {@code "javascript:"} (the same filter the list-based
 * {@code addTargetRequests} overloads apply). Otherwise it is passed through
 * {@code UrlUtils.canonicalizeUrl} with this page's url and added to the
 * target requests.
 *
 * @param requestString url to add
 */
public void addTargetRequest(String requestString) {
    if (StringUtils.isBlank(requestString)
            || requestString.equals("#")
            || requestString.startsWith("javascript:")) {
        // Not a fetchable link; keep behavior in line with addTargetRequests.
        return;
    }
    requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
    targetRequests.add(new Request(requestString));
}
 
Example #6
Source File: HttpUriRequestConverter.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the {@link HttpUriRequest} for a crawl request.
 *
 * <p>The request method comes from {@code selectRequestMethod(request)} and
 * the uri from {@code UrlUtils.fixIllegalCharacterInUrl(request.getUrl())}.
 * When {@code site} is non-null its headers (if any) are added and its
 * timeout is used for the connection-request, socket and connect timeouts,
 * with the standard cookie spec. When {@code proxy} is non-null it is set
 * on the request config. Headers carried by the request itself are added
 * last, after the request has been built.
 *
 * @param request crawl request providing url, method and per-request headers
 * @param site    site settings, may be {@code null}
 * @param proxy   proxy to route through, may be {@code null}
 * @return the built request
 */
private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
    RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl()));
    RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();

    // All site-derived settings sit behind one null check; previously the
    // header copy dereferenced site before the check and could throw NPE.
    if (site != null) {
        if (site.getHeaders() != null) {
            for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
                requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
            }
        }
        requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                .setSocketTimeout(site.getTimeOut())
                .setConnectTimeout(site.getTimeOut())
                .setCookieSpec(CookieSpecs.STANDARD);
    }

    if (proxy != null) {
        requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()));
    }
    requestBuilder.setConfig(requestConfigBuilder.build());
    HttpUriRequest httpUriRequest = requestBuilder.build();
    if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
        for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
            httpUriRequest.addHeader(header.getKey(), header.getValue());
        }
    }
    return httpUriRequest;
}
 
Example #7
Source File: Spider.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Pushes a request to the scheduler.
 *
 * <p>If the site has no domain yet and the request has a url, the site's
 * domain is first set from {@code UrlUtils.getDomain} of that url.
 *
 * @param request request to schedule
 */
private void addRequest(Request request) {
    boolean domainMissing = site.getDomain() == null;
    if (domainMissing && request != null) {
        if (request.getUrl() != null) {
            String domain = UrlUtils.getDomain(request.getUrl());
            site.setDomain(domain);
        }
    }
    scheduler.push(request, this);
}
 
Example #8
Source File: HttpUriRequestConverterTest.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Checks that a url ending in "##", after going through
 * UrlUtils.fixIllegalCharacterInUrl and the converter, yields a request
 * whose URI ends in a single "#".
 */
@Test
public void test_illegal_uri_correct() throws Exception {
    HttpUriRequestConverter converter = new HttpUriRequestConverter();

    String fixedUrl = UrlUtils.fixIllegalCharacterInUrl("http://bj.zhongkao.com/beikao/yimo/##");
    Request request = new Request(fixedUrl);
    HttpClientRequestContext context = converter.convert(request, Site.me(), null);

    URI actual = context.getHttpUriRequest().getURI();
    assertThat(actual).isEqualTo(new URI("http://bj.zhongkao.com/beikao/yimo/#"));
}
 
Example #9
Source File: SpiderMonitor.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Registers a spider status bean with the MBean server.
 *
 * <p>The object name is {@code jmxServerName + ":name=" + n}, where
 * {@code n} is the status name passed through {@code UrlUtils.removePort}.
 * NOTE(review): presumably the port is stripped because a ':' is not legal
 * in an unquoted ObjectName value; confirm against UrlUtils.
 *
 * @param spiderStatus bean to register
 */
protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException {
    String nameWithoutPort = UrlUtils.removePort(spiderStatus.getName());
    ObjectName objName = new ObjectName(jmxServerName + ":name=" + nameWithoutPort);
    mbeanServer.registerMBean(spiderStatus, objName);
}