Java Code Examples for com.gargoylesoftware.htmlunit.html.HtmlPage#asXml()

The following examples show how to use com.gargoylesoftware.htmlunit.html.HtmlPage#asXml(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.

Example 1

Source File: WebClient8Test.java From htmlunit with Apache License 2.0

6 votes

/**
 * Checks that a deep clone of a loaded page serializes to the same XML as the page itself.
 *
 * @throws Exception if something goes wrong
 */
@Test
public void cloneNode() throws Exception {
    final String markup = "<html>\n"
            + "<head><title>foo</title></head>\n"
            + "<body>\n"
            + "<p>hello world</p>\n"
            + "</body></html>";

    try (WebClient client = new WebClient(getBrowserVersion(), false, null, -1)) {
        final HtmlPage original = loadPage(client, markup, null, URL_FIRST);

        // Serialize the source page first, then clone it and compare the two serializations.
        final String expectedXml = original.asXml();
        final HtmlPage copy = original.cloneNode(true);

        assertEquals(expectedXml, copy.asXml());
    }
}

Example 2

Source File: EpgCrawler.java From MyTv with Apache License 2.0

6 votes

/**
 * Returns the TV stations parsed either from a previously saved crawl file or,
 * when no such file exists, from a fresh crawl of {@code getUrl()}.
 *
 * <p>After a fresh crawl the page XML is written out through
 * {@code MyTvUtils.outputCrawlData} and every registered listener receives an
 * {@code AllTvStationCrawlEndEvent}. Listeners are not notified when the saved
 * file is used.
 *
 * @return the parsed stations, or {@code null} if the saved file exists but
 *         reading it fails with an {@code IOException}
 */
@Override
public List<TvStation> crawlAllTvStation() {
	final String savedPath = getCrawlFilePath();
	if (new File(savedPath).exists()) {
		try {
			return parseTvStation(MyTvUtils.readAsHtml(savedPath));
		} catch (IOException ignored) {
			// deliberately ignored: an unreadable saved file yields no result
			return null;
		}
	}

	final HtmlPage crawledPage = (HtmlPage) WebCrawler.crawl(getUrl());
	final String pageXml = crawledPage.asXml();
	MyTvUtils.outputCrawlData(getCrawlerName(), pageXml, getCrawlFileName());

	final List<TvStation> stations = parseTvStation(pageXml);
	for (CrawlEventListener each : listeners) {
		each.crawlEnd(new AllTvStationCrawlEndEvent(this, stations));
	}
	return stations;
}

Example 3

Source File: HTMLTableElement2Test.java From htmlunit with Apache License 2.0

5 votes

/**
 * Checks that a {@code width} assigned to a table from script shows up as an
 * attribute in the page's XML serialization.
 *
 * @throws Exception if the test fails
 */
@Test
public void width() throws Exception {
    final String markup = "<html><head></head><body>\n"
        + "<table id='tableID' style='background:blue'><tr><td></td></tr></table>\n"
        + "<script language='javascript'>\n"
        + "    var table = document.getElementById('tableID');\n"
        + "    table.width = '200';\n"
        + "</script></body></html>";

    final String serialized = loadPage(markup).asXml();

    assertTrue(serialized.contains("width=\"200\""));
}

Example 4

Source File: Downloader.java From MMDownloader with Apache License 2.0

5 votes

/**
 * Fetches a page with HtmlUnit and returns its source.
 *
 * <p>The request is sent as a POST carrying the {@code pass} parameter, with a
 * custom User-Agent and gzip accepted. The web client is always closed, even
 * when loading or serializing the page fails.
 *
 * @param eachArchiveAddress address of the archive that actually holds the comic
 * @return the page serialized via {@code HtmlPage#asXml()} on success
 * @throws Exception if the URL is malformed or the page cannot be loaded
 */
private String getHtmlPageHtmlUnit(String eachArchiveAddress) throws Exception {
	/* Required: turn logging off. HtmlUnit is so verbose that everything is silenced. */
	java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
	System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");

	print.info("일반 연결 시도중...\n");

	WebClient webClient = new WebClient();
	String pageSource;
	try {
		webClient.getOptions().setRedirectEnabled(true);

		WebRequest req = new WebRequest(new URL(eachArchiveAddress));
		req.setHttpMethod(HttpMethod.POST);
		req.setAdditionalHeader("User-Agent", UserAgent.getUserAgent());
		req.setAdditionalHeader("Accept-Encoding", "gzip"); // gzip added 2017-11-26
		req.getRequestParameters().add(new NameValuePair("pass", PASSWORD)); // password sent as a POST parameter

		HtmlPage page = webClient.getPage(req);

		// String holding the page source, HTML included
		pageSource = page.asXml();

		/* TODO: validation for a failed page parse should be added here as well */
	} finally {
		// Release the client on every path; previously an exception above leaked it.
		webClient.close();
	}

	print.info("일반 연결 성공\n");
	return pageSource;
}

Example 5

Source File: htmlunitTest.java From crawler-jsoup-maven with Apache License 2.0

5 votes

/**
 * Loads a JavaScript-driven forum page with HtmlUnit, hands the rendered XML to
 * Jsoup and prints the elements matching {@code td.title_1}.
 *
 * @param args unused
 * @throws FailingHttpStatusCodeException declared by the page load
 * @throws MalformedURLException if the hard-coded URL is rejected
 * @throws IOException if loading the page fails
 */
public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {

    // Silence logging from HtmlUnit and the libraries it uses
    LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log","org.apache.commons.logging.impl.NoOpLog");
    java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
    java.util.logging.Logger.getLogger("org.apache.http.client").setLevel(Level.OFF);

    String url = "https://www.newsmth.net/nForum/#!section/Estate";
    System.out.println("Loading page now-----------------------------------------------: "+url);

    /* HtmlUnit simulated browser; try-with-resources closes it even if loading fails */
    final String pageAsXml;
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        webClient.getOptions().setJavaScriptEnabled(true);              // enable the JS interpreter
        webClient.getOptions().setCssEnabled(false);                    // disable CSS support
        webClient.getOptions().setThrowExceptionOnScriptError(false);   // do not throw when a script fails
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setTimeout(10 * 1000);                   // connection timeout
        HtmlPage page = webClient.getPage(url);
        webClient.waitForBackgroundJavaScript(30 * 1000);               // wait up to 30 seconds for background JS

        // Serialize while the client is still open
        pageAsXml = page.asXml();
    }

    /* Parse with Jsoup */
    // Document doc = Jsoup.parse(pageAsXml, "https://bluetata.com/");
    Document doc = Jsoup.parse(pageAsXml);
    //Elements pngs = doc.select("img[src$=.png]");                   // select all png image elements

    Elements eles = doc.select("td.title_1");
    // further processing would go here
    System.out.println(eles.toString());
}

Example 6

Source File: TvMaoCrawler.java From MyTv with Apache License 2.0

5 votes

/**
 * Parses the TV stations listed on a page for the given city.
 *
 * <p>The text of the first {@code <b>} element matched by the navigation-bar
 * XPath is used, together with the city, to name the file the page XML is
 * written to.
 *
 * @param htmlPage
 *            the loaded page to read stations from
 * @param city
 *            the city the stations belong to
 * @return the stations parsed from the page XML
 */
private List<TvStation> getTvStations(HtmlPage htmlPage, String city) {
	final String pageXml = htmlPage.asXml();

	final String headingPath = "//div[@class='chlsnav']/div[@class='pbar']/b";
	final List<?> headings = htmlPage.getByXPath(headingPath);
	final HtmlBold heading = (HtmlBold) headings.get(0);
	final String category = heading.getTextContent().trim();

	MyTvUtils.outputCrawlData(getCrawlerName(), pageXml,
			getCrawlFileName(city, category));

	final List<TvStation> stations = parseTvStation(city, pageXml);
	logger.debug("tv station crawled." + stations);
	return stations;
}

Example 7

Source File: SentenceExtractor.java From superword with Apache License 2.0

5 votes

/**
 * Loads the page at the given URL through {@code WEB_CLIENT} and returns it
 * serialized as XML.
 *
 * @param url address of the page to load
 * @return the page XML, or an empty string if loading or serializing throws
 */
public static String getContent2(String url) {
    try {
        LOGGER.debug("url:"+url);
        HtmlPage htmlPage = WEB_CLIENT.getPage(url);
        return htmlPage.asXml();
    } catch (Exception e) {
        // The exception is passed to the logger, which records the stack trace;
        // a separate printStackTrace() would only duplicate it on stderr.
        LOGGER.error("获取URL："+url+"页面出错", e);
    }
    return "";
}

Example 8

Source File: JsSupporedUrlFetcher.java From seldon-server with Apache License 2.0

5 votes

/**
 * Fetches a URL with a fresh HtmlUnit client and returns the page serialized
 * as XML, logging the user agent used and the time the fetch took.
 *
 * @param url address of the page to load
 * @return the loaded page's XML
 * @throws Exception if the page cannot be loaded
 */
@Override
public String getUrl(String url) throws Exception {
    long timing_start = System.currentTimeMillis();

    BrowserVersion browserVersion = BrowserVersion.getDefault();
    logger.info("Using user-agent: " + browserVersion.getUserAgent());
    final WebClient webClient = new WebClient(browserVersion);
    try {
        webClient.setTimeout(httpGetTimeout);
        final HtmlPage page = webClient.getPage(url);
        long timing_end = System.currentTimeMillis();
        logger.info(String.format("fetched page[%s] in ms[%d]", url, (timing_end - timing_start)));
        return page.asXml();
    } finally {
        // A client is created per call, so release its windows on every path;
        // previously it was never closed.
        webClient.closeAllWindows();
    }
}

Example 9

Source File: HtmlUnitPageLoader.java From xxl-crawler with GNU General Public License v3.0

4 votes

/**
 * Loads the requested page with HtmlUnit and returns it as a Jsoup document.
 *
 * <p>Request parameters, cookies, headers, user agent, referrer, timeout and
 * proxy are taken from the {@code PageRequest}. After the page is fetched, the
 * client waits up to the request timeout for background JavaScript before the
 * page is serialized.
 *
 * @param pageRequest description of the page to load
 * @return the parsed document, or {@code null} if the URL is rejected by
 *         {@code UrlUtil.isUrl}, an {@code IOException} occurs, or the page
 *         serializes to {@code null}
 */
@Override
public Document load(PageRequest pageRequest) {
    if (!UrlUtil.isUrl(pageRequest.getUrl())) {
        return null;
    }

    try (WebClient webClient = new WebClient()) {
        WebRequest webRequest = new WebRequest(new URL(pageRequest.getUrl()));

        // request settings
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setDoNotTrackEnabled(false);
        webClient.getOptions().setUseInsecureSSL(!pageRequest.isValidateTLSCertificates());

        if (pageRequest.getParamMap() != null && !pageRequest.getParamMap().isEmpty()) {
            for (Map.Entry<String, String> paramItem : pageRequest.getParamMap().entrySet()) {
                webRequest.getRequestParameters().add(new NameValuePair(paramItem.getKey(), paramItem.getValue()));
            }
        }
        if (pageRequest.getCookieMap() != null && !pageRequest.getCookieMap().isEmpty()) {
            webClient.getCookieManager().setCookiesEnabled(true);
            for (Map.Entry<String, String> cookieItem : pageRequest.getCookieMap().entrySet()) {
                webClient.getCookieManager().addCookie(new Cookie("", cookieItem.getKey(), cookieItem.getValue()));
            }
        }
        if (pageRequest.getHeaderMap() != null && !pageRequest.getHeaderMap().isEmpty()) {
            webRequest.setAdditionalHeaders(pageRequest.getHeaderMap());
        }
        if (pageRequest.getUserAgent() != null) {
            webRequest.setAdditionalHeader("User-Agent", pageRequest.getUserAgent());
        }
        if (pageRequest.getReferrer() != null) {
            webRequest.setAdditionalHeader("Referer", pageRequest.getReferrer());
        }

        webClient.getOptions().setTimeout(pageRequest.getTimeoutMillis());
        webClient.setJavaScriptTimeout(pageRequest.getTimeoutMillis());

        // proxy
        if (pageRequest.getProxy() != null) {
            InetSocketAddress address = (InetSocketAddress) pageRequest.getProxy().address();
            boolean isSocks = pageRequest.getProxy().type() == Proxy.Type.SOCKS;
            webClient.getOptions().setProxyConfig(new ProxyConfig(address.getHostName(), address.getPort(), isSocks));
        }

        // issue the request
        webRequest.setHttpMethod(pageRequest.isIfPost() ? HttpMethod.POST : HttpMethod.GET);
        HtmlPage page = webClient.getPage(webRequest);

        // Wait only once a page exists: before the fetch there are no
        // background scripts to wait for, so the call had no effect there.
        webClient.waitForBackgroundJavaScript(pageRequest.getTimeoutMillis());

        String pageAsXml = page.asXml();
        if (pageAsXml != null) {
            return Jsoup.parse(pageAsXml);
        }
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
    }
    return null;
}

Example 10

Source File: HtmlUnitDownloder.java From gecco-htmlunit with MIT License

4 votes

/**
 * Downloads the requested page with HtmlUnit and converts it to an
 * {@code HttpResponse}.
 *
 * <p>The request is a GET unless it is an {@code HttpPostRequest}, in which
 * case its fields are sent as POST parameters. For status 301/302 the response
 * content is the absolute redirect target; for status 200 it is the page XML
 * plus raw stream, content type and charset.
 *
 * @param request the request describing URL, headers, charset and (for POST) fields
 * @param timeout timeout applied to the web client's options
 * @return the populated response for status 200, 301 or 302
 * @throws DownloadException for any other status, or wrapping any exception
 *         raised while building or executing the request
 */
public HttpResponse download(HttpRequest request, int timeout) throws DownloadException {
	try {
		URL url = new URL(request.getUrl());
		WebRequest webRequest = new WebRequest(url);
		webRequest.setHttpMethod(HttpMethod.GET);
		if(request instanceof HttpPostRequest) {//post
			HttpPostRequest post = (HttpPostRequest)request;
			webRequest.setHttpMethod(HttpMethod.POST);
			List<NameValuePair> requestParameters = new ArrayList<NameValuePair>();
			for(Map.Entry<String, Object> entry : post.getFields().entrySet()) {
				NameValuePair nvp = new NameValuePair(entry.getKey(), entry.getValue().toString());
				requestParameters.add(nvp);
			}
			webRequest.setRequestParameters(requestParameters);
		}
		//header
		boolean isMobile = SpiderThreadLocal.get().getEngine().isMobile();
		webRequest.setAdditionalHeader("User-Agent", UserAgent.getUserAgent(isMobile));
		webRequest.setAdditionalHeaders(request.getHeaders());
		//proxy
		HttpHost proxy = Proxys.getProxy();
		if(proxy != null) {
			webRequest.setProxyHost(proxy.getHostName());
			webRequest.setProxyPort(proxy.getPort());
		}
		//timeout
		this.webClient.getOptions().setTimeout(timeout);
		//request,response
		// Fetch once, using the configured request. A second fetch by bare URL
		// would discard the method, POST parameters, headers and proxy set above.
		HtmlPage page = webClient.getPage(webRequest);
		HttpResponse resp = new HttpResponse();
		WebResponse webResponse = page.getWebResponse();
		int status = webResponse.getStatusCode();
		resp.setStatus(status);
		if(status == 302 || status == 301) {
			String redirectUrl = webResponse.getResponseHeaderValue("Location");
			resp.setContent(UrlUtils.relative2Absolute(request.getUrl(), redirectUrl));
		} else if(status == 200) {
			String content = page.asXml();
			resp.setContent(content);
			resp.setRaw(webResponse.getContentAsStream());
			String contentType = webResponse.getContentType();
			resp.setContentType(contentType);
			String charset = getCharset(request.getCharset(), contentType);
			resp.setCharset(charset);
		} else {
			throw new DownloadException("ERROR : " + status);
		}
		return resp;
	} catch(Exception ex) {
		throw new DownloadException(ex);
	}
}