Java Code Examples for com.gargoylesoftware.htmlunit.BrowserVersion.CHROME

The following are Java code examples showing how to use the CHROME constant of the com.gargoylesoftware.htmlunit.BrowserVersion class. You can vote up the examples you like. Your votes will be used in our system to surface more good examples.
+ Save this method
Example 1
Project: duck-feed-2   File: DuckScrape.java   View Source Code Vote up 8 votes
/**
 * Runs a search against DuckDuckGo's HTML endpoint and collects the href of
 * every anchor carrying the {@code result__url} class.
 *
 * @param keyword the search phrase; it is URL-encoded before being appended
 *                to the query string
 * @return the collected hrefs; empty when nothing matched or the lookup failed
 */
public static List<String> searchDuck (String keyword) {
    List<String> searchResults = new ArrayList<>();
    // try-with-resources: the client is released even when the request fails
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        // Encode the keyword so spaces, '&', '#', ... cannot corrupt the query string
        String query = java.net.URLEncoder.encode(keyword, "UTF-8");
        HtmlPage page = webClient.getPage("https://duckduckgo.com/html/?q=" + query);
        List<HtmlAnchor> anchors = page.getByXPath("//a[@class='result__url']");
        for (HtmlAnchor anchor : anchors) {
            searchResults.add(anchor.getHrefAttribute());
        }
    }
    catch (Exception e) {
        // Best effort: report the failure and return whatever was collected so far
        System.err.println(e);
    }
    return searchResults;
}
 
Example 2
Project: Forum-Notifier   File: StatViewController.java   View Source Code Vote up 7 votes
/**
 * Loads the account's forum front page using the account's cookies and reads
 * the value of the input found by a fixed XPath below the element with id
 * {@code XenForo}.
 *
 * @param account supplies the cookies and the forum (protocol + address) to load
 * @return the input's value attribute, or {@code null} when the page could not
 *         be loaded or the XPath matched nothing
 */
private String getXenToken(final Account account) { // TODO: Re-write this if it ever gets used
    final WebClient webClient = new WebClient(BrowserVersion.CHROME);
    webClient.getOptions().setCssEnabled(false);
    webClient.getOptions().setJavaScriptEnabled(false);

    account.getCookies().forEach(c -> webClient.getCookieManager().addCookie(c));

    try {
        final HtmlPage page = webClient.getPage(account.getForum().getProtocol() + "://" + account.getForum());
        final HtmlInput token = page.getFirstByXPath("//*[@id='XenForo']/body/div[1]/aside[2]/div/div/div[1]/div[2]/form/div/input[2]");

        // getFirstByXPath yields null when nothing matches; report that as
        // "no token" instead of relying on a caught NullPointerException
        return token == null ? null : token.getValueAttribute();
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    } finally {
        // Single close point for both the success and the failure path
        webClient.close();
    }
}
 
Example 3
Project: pdfdbscrap   File: HtmlUnitExperiments.java   View Source Code Vote up 6 votes
/**
	 * Baseline scraping experiment, intended as a template for new ones.
	 *
	 * Checklist before building on it:
	 * <ol>
	 * <li>
	 * Confirm the page can be retrieved at all. If the client crashes, retry
	 * with JavaScript switched off.
	 * </li>
	 * <li>
	 * Confirm the retrieved content is what you expected (no redirect, proxy
	 * or login page in the way) by reading the printed text.
	 * </li>
	 * <li>
	 * Consider switching CSS off as well to speed things up.
	 * </li>
	 * </ol>
	 */
	@Test
	public void scrapingBaseExperiment() {
		assertOutputDirectoryExists();

		final String target = "http://www.google.com";

		try (WebClient browser = new WebClient(BrowserVersion.CHROME)) {
			// Optional switches from the checklist above, left disabled:
//			browser.getOptions().setCssEnabled(false);
//			browser.getOptions().setJavaScriptEnabled(false);

			try {
				final HtmlPage fetched = browser.getPage(target);
				System.out.println("HtmlPage:");
				final String renderedText = fetched.asText();
				System.out.println(renderedText);
				System.out.print("\n");
			} catch (IOException failure) {
				// A failed visit is reported but does not fail the experiment
				System.err.println("WARNING: failed to visit: " + target);
				failure.printStackTrace(System.err);
			}
		}
	}
 
Example 4
Project: XC2   File: Server.java   View Source Code Vote up 6 votes
/**
 * Creates the server: sets up its bookkeeping collections, worker pool and
 * bounded buffer, and configures the Chrome-emulating browser.
 *
 * @throws IOException declared by the constructor signature
 */
public Server() throws IOException {
    // Browser first, with its tuning applied right away.
    // Cookies are essential; the other switches are there for efficiency.
    this.browser = new WebClient(BrowserVersion.CHROME);
    this.browser.getCookieManager().setCookiesEnabled(true);
    this.browser.getOptions().setCssEnabled(false);
    this.browser.getOptions().setJavaScriptEnabled(false);
    this.browser.getOptions().setThrowExceptionOnScriptError(false);
    this.browser.getOptions().setUseInsecureSSL(true);
    this.browser.setAjaxController(new NicelyResynchronizingAjaxController());

    // State flags
    this.isSubmitting = false;
    this.done = false;

    // NOTE(review): credentials are hard-coded in source — consider moving them to configuration
    this.user_name = "XC2";
    this.pass_word = "donthackme";

    // Collections and worker pool
    this.problems = new ArrayList<>();
    this.work_list = new ArrayList<>();
    this.users_data = new HashMap<>();
    this.application = Executors.newCachedThreadPool();

    // The buffer holds at most 100 entries (may be adjusted if needed)
    this.buffer_queue = new LinkedBlockingQueue<>(100);
}
 
Example 5
Project: xxl-incubator   File: Demo.java   View Source Code Vote up 6 votes
/**
 * Demo: loads a page with a Chrome-emulating HtmlUnit client and prints the
 * page as XML.
 *
 * @param args unused
 * @throws IOException if the page cannot be fetched
 */
public static void main(String[] args) throws IOException {

    // Browser; try-with-resources releases the client when the demo is done
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        webClient.getOptions().setUseInsecureSSL(true); // accept any HTTPS certificate
        webClient.getOptions().setJavaScriptEnabled(true); // enable the JS interpreter (true is the default)
        webClient.getOptions().setCssEnabled(false); // disable CSS support
        webClient.getOptions().setThrowExceptionOnScriptError(false); // do not throw when a script fails
        webClient.getOptions().setTimeout(10000); // connection timeout: 10 s; 0 would wait indefinitely
        webClient.getOptions().setDoNotTrackEnabled(false);
        webClient.setJavaScriptTimeout(8000); // maximum run time for a script

        // proxy
        //webClient.getOptions().setProxyConfig(new ProxyConfig("IP", 80));

        HtmlPage page = webClient.getPage("http://---");

        // waitForBackgroundJavaScript is a blocking call, not a setting: it only
        // has something to wait for once a page has been loaded, so it must run
        // after getPage and before the page content is read.
        webClient.waitForBackgroundJavaScript(500);

        String pageXml = page.asXml(); // response content as XML
        System.out.println(pageXml);
    }
}
 
Example 6
Project: Forum-Notifier   File: LoginViewController.java   View Source Code Vote up 6 votes
/**
 * Loads {@code url}, waits for background JavaScript, submits the form with
 * id {@code challenge-form} through an injected submit button, and returns
 * the {@code cf_clearance} cookie held by the client afterwards.
 *
 * @param url the page to load
 * @return the {@code cf_clearance} cookie, or {@code null} when any step
 *         failed (the exception is printed)
 */
public static Cookie completeCloudflareBrowserCheck(final String url) {
    WebClient completeClient = new WebClient(BrowserVersion.CHROME);
    completeClient.getOptions().setCssEnabled(false);
    completeClient.getOptions().setThrowExceptionOnFailingStatusCode(false);

    try {
        final HtmlPage page = completeClient.getPage(url);
        completeClient.waitForBackgroundJavaScript(5000);

        // The form is submitted by clicking a button appended to it
        final HtmlElement submitButton = (HtmlElement) page.createElement("button");
        submitButton.setAttribute("type", "submit");

        final HtmlForm challengeForm = (HtmlForm) page.getElementById("challenge-form");
        challengeForm.appendChild(submitButton);
        submitButton.click();

        return completeClient.getCookieManager().getCookie("cf_clearance");
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    } finally {
        // Previously the client was never closed on any path
        completeClient.close();
    }
}
 
Example 7
Project: alimama   File: HtmlUnitUtil.java   View Source Code Vote up 6 votes
/**
 * Silences HtmlUnit / HTTP client logging and returns a Chrome-emulating
 * client with JavaScript and CSS enabled and 60-second timeouts.
 *
 * @return the configured client; the caller owns it
 */
public static WebClient create() {
	// Route commons-logging to the no-op logger and switch off the JUL loggers
	LogFactory.getFactory().setAttribute(
			"org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
	java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
	java.util.logging.Logger.getLogger("org.apache.http.client").setLevel(Level.OFF);

	final WebClient client = new WebClient(BrowserVersion.CHROME);

	// Feature switches
	client.getOptions().setCssEnabled(true);
	client.getOptions().setJavaScriptEnabled(true);
	client.getOptions().setUseInsecureSSL(true);

	// Errors are tolerated rather than thrown
	client.getOptions().setThrowExceptionOnFailingStatusCode(false);
	client.getOptions().setThrowExceptionOnScriptError(false);

	// Timeouts and AJAX handling
	client.getOptions().setTimeout(60000);
	client.setJavaScriptTimeout(60000);
	client.setAjaxController(new NicelyResynchronizingAjaxController());

	// NOTE(review): no page has been loaded yet at this point — confirm this wait is intended
	client.waitForBackgroundJavaScript(120000);
	return client;
}
 
Example 8
Project: awesome-agile   File: AwesomeAgileFunctionalTest.java   View Source Code Vote up 6 votes
/**
 * Per-test fixture: prints the endpoints in use, resets both fake servers to
 * a known state, and creates a JavaScript-enabled HtmlUnit driver.
 */
@Before
public void setUp() throws Exception {
  // Announce where each endpoint is listening
  System.out.println("Fake Google OAuth2 server up at: " + fakeGoogleServer.getEndpoint());
  System.out.println("Fake Hackpad server up at: " + fakeHackpadServer.getEndpoint());
  System.out.println("AwesomeAgile web application up at: " + getEndpoint());

  // Fake Google server: client credentials, redirect prefix, person
  fakeGoogleServer.setClientId(CLIENT_ID);
  fakeGoogleServer.setClientSecret(CLIENT_SECRET);
  final String redirectPrefix = "http://localhost:" + port + "/";
  fakeGoogleServer.setRedirectUriPrefixes(ImmutableList.of(redirectPrefix));
  fakeGoogleServer.setPerson(createUser());

  // Fake Hackpad server: client credentials, then start from exactly two template pads
  fakeHackpadServer.setClientId(HACKPAD_CLIENT_ID);
  fakeHackpadServer.setClientSecret(HACKPAD_CLIENT_SECRET);
  fakeHackpadServer.getHackpads().clear();

  final PadIdentity readyTemplate = new PadIdentity(DEFINITION_OF_READY_TEMPLATE_ID);
  fakeHackpadServer.addHackpad(readyTemplate, DEFINITION_OF_READY_CONTENTS);

  final PadIdentity doneTemplate = new PadIdentity(DEFINITION_OF_DONE_TEMPLATE_ID);
  fakeHackpadServer.addHackpad(doneTemplate, DEFINITION_OF_DONE_CONTENTS);

  // Browser under test
  driver = new HtmlUnitDriver(BrowserVersion.CHROME);
  driver.setJavascriptEnabled(true);
}
 
Example 9
Project: awesome-agile   File: AwesomeAgileFunctionalTest.java   View Source Code Vote up 6 votes
/**
 * Creates a definition of ready in the shared browser, then logs in from a
 * second, brand-new browser and checks that the same hackpad can be viewed
 * there with the expected contents.
 */
@Test
public void testDashboardReopenBrowser() throws Exception {
  LandingPage landingPage = PageFactory.initElements(driver, LandingPage.class);
  landingPage.loginWithGoogle(getEndpoint());
  assertThat(driver.getWindowHandles(), hasSize(1));
  landingPage.createDefinitionOfReady();

  HtmlUnitDriver driverTwo = new HtmlUnitDriver(BrowserVersion.CHROME);
  try {
    driverTwo.setJavascriptEnabled(true);

    // Open a completely new browser with no cookies
    // Verify that view button is visible for this same user,
    // and we're able to open his Definition of ready hackpad
    LandingPage landingPageTwo = PageFactory.initElements(driverTwo, LandingPage.class);
    landingPageTwo.loginWithGoogle(getEndpoint());
    landingPageTwo.waitForDefinitionOfReady();
    assertTrue(landingPageTwo.isDefinitionOfReadyViewable());
    landingPageTwo.viewDefinitionOfReady();

    // The hackpad opens in a new window: pick the handle that is not the current one
    String newWindow = Iterables.getFirst(Sets.difference(
        driverTwo.getWindowHandles(),
        ImmutableSet.of(driverTwo.getWindowHandle())), null);
    driverTwo.switchTo().window(newWindow);
    HackpadPage hackpadPage = PageFactory.initElements(driverTwo, HackpadPage.class);
    assertEquals(DEFINITION_OF_READY_CONTENTS, hackpadPage.getContent());
  } finally {
    // The second driver is local to this test, so it must be shut down here,
    // including when an assertion above fails
    driverTwo.quit();
  }
}
 
Example 10
Project: tool.accelerate.core   File: PageFunctionTest.java   View Source Code Vote up 6 votes
/**
 * Loads the start page from the local test server and checks that the
 * element with id {@code techTable} has a first child containing more than
 * one child element.
 */
@Ignore
@Test
// TODO: This method of testing does not work for angular, need to find an alternative method of testing
public void techFormTest() {
    String port = System.getProperty("liberty.test.port");
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        HtmlPage page = webClient.getPage("http://localhost:" + port + "/start/");

        // Fail with a readable message instead of a NullPointerException
        // when the expected elements are missing
        DomElement techForm = page.getElementById("techTable");
        assertTrue("Expected an element with id techTable on the start page", techForm != null);
        DomElement formBody = techForm.getFirstElementChild();
        assertTrue("Expected the tech table to have a child element", formBody != null);

        int count = formBody.getChildElementCount();
        // We expect there to be more than one child element, otherwise the
        // javascript has not created the tech table properly.
        assertTrue("Expected more than one element in the tech table, instead found " + count, count > 1);
    } catch (Exception e) {
        // Report the exception itself: getCause() is null for exceptions without
        // a cause, and dereferencing it replaced the real failure with an NPE
        org.junit.Assert.fail("Caught exception: " + e);
    }
}
 
Example 11
Project: gwt-sl   File: BaseIntegrationTest.java   View Source Code Vote up 6 votes
/**
 * Per-test fixture: creates the shared Chrome-emulating client in a strict
 * configuration (failing status codes and script errors throw) and routes
 * JavaScript alerts to standard error.
 */
@Before
public void setupBrowser() {
	webClient = new WebClient(BrowserVersion.CHROME);

	// Enabled features
	webClient.getOptions().setJavaScriptEnabled(true);
	webClient.getOptions().setCssEnabled(true);
	webClient.getOptions().setRedirectEnabled(true);
	webClient.getOptions().setPopupBlockerEnabled(true);

	// Disabled features
	webClient.getOptions().setAppletEnabled(false);
	webClient.getOptions().setPrintContentOnFailingStatusCode(false);

	// Strictness: HTTP failures and script errors surface as exceptions
	webClient.getOptions().setThrowExceptionOnFailingStatusCode(true);
	webClient.getOptions().setThrowExceptionOnScriptError(true);

	webClient.getOptions().setTimeout(TIMEOUT);

	// Handlers: CSS problems are silenced, alerts are echoed to stderr
	webClient.setCssErrorHandler(new SilentCssErrorHandler());
	webClient.setAjaxController(new NicelyResynchronizingAjaxController());
	webClient.setAlertHandler(new AlertHandler() {
		public void handleAlert(Page source, String text) {
			System.err.println("[alert] " + text);
		}
	});

	webClient.waitForBackgroundJavaScript(TIMEOUT);
}
 
Example 12
Project: datasponge   File: SpiderThread.java   View Source Code Vote up 6 votes
/**
 * Builds the WebClient used to fetch and parse web pages. The client goes
 * through the configured proxy when {@code proxy} is non-blank.
 *
 * @param minimal when {@code true}, applets, JavaScript and CSS are disabled
 * @return a new Chrome-emulating WebClient instance
 */
private WebClient initializeWebClient(boolean minimal) {
    final boolean viaProxy = proxy != null && !proxy.trim().isEmpty();
    final WebClient browser = viaProxy
            ? new WebClient(BrowserVersion.CHROME, proxy, port)
            : new WebClient(BrowserVersion.CHROME);

    if (!minimal) {
        return browser;
    }

    // Minimal mode: switch off everything except plain HTML loading
    browser.getOptions().setAppletEnabled(false);
    browser.getOptions().setJavaScriptEnabled(false);
    browser.getOptions().setCssEnabled(false);
    return browser;
}
 
Example 13
Project: Crawler_Self_Analysis   File: ApiUse.java   View Source Code Vote up 6 votes
/**
 * Demo: loads a page with CSS and JavaScript disabled and prints the href of
 * every anchor on it.
 *
 * @param args unused
 * @throws Exception if the page cannot be loaded
 */
public static void main(String[] args) throws Exception {
	// Emulate Chrome; pick a different BrowserVersion constant for another browser
	WebClient  webClient=new WebClient(BrowserVersion.CHROME);
	try {
		webClient.getOptions().setCssEnabled(false);
		webClient.getOptions().setJavaScriptEnabled(false);
		HtmlPage page=webClient.getPage("http://www.yanyulin.info");
		// Alternative: fetch the element with id "hed" directly and print it
		//HtmlDivision div=(HtmlDivision)page.getElementById("hed");
		//System.out.println(div.asXml());
		// Alternative: "//div" selects every div in the document; take the first one
		//final HtmlDivision div = (HtmlDivision) page.getByXPath("//div").get(0);
		//System.out.println(div.asXml());
		java.util.List<HtmlAnchor> achList=page.getAnchors();
		for(HtmlAnchor ach:achList){
			System.out.println(ach.getHrefAttribute());
		}
	} finally {
		// Release the client even when loading or printing fails
		webClient.closeAllWindows();
	}
}
 
Example 14
Project: crawler-jsoup-maven   File: htmlunitTest.java   View Source Code Vote up 5 votes
/**
 * Demo: renders a JavaScript-driven page with HtmlUnit, then hands the
 * resulting XML to Jsoup and prints the parsed document.
 *
 * @param args unused
 * @throws FailingHttpStatusCodeException declared for the page load
 * @throws MalformedURLException if the URL is malformed
 * @throws IOException if the page cannot be fetched
 */
public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {

    // Silence HtmlUnit and HTTP client logging
    LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log","org.apache.commons.logging.impl.NoOpLog");
    java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
    java.util.logging.Logger.getLogger("org.apache.http.client").setLevel(Level.OFF);

    String url = "https://www.douyin.com/share/video/6496703951436516621/?mid=6484356820260686606";
    System.out.println("Loading page now-----------------------------------------------: "+url);

    /* HtmlUnit: emulate the browser; the client is released as soon as the XML has been extracted */
    String pageAsXml;
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        webClient.getOptions().setJavaScriptEnabled(true);              // enable the JS interpreter (true is the default)
        webClient.getOptions().setCssEnabled(false);                    // disable CSS support
        webClient.getOptions().setThrowExceptionOnScriptError(false);   // do not throw when a script fails
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setTimeout(10 * 1000);                   // connection timeout
        HtmlPage page = webClient.getPage(url);
        webClient.waitForBackgroundJavaScript(30 * 1000);               // give background JS up to 30 s

        pageAsXml = page.asXml();
    }

    /* Jsoup: parse the rendered markup */
    // Document doc = Jsoup.parse(pageAsXml, "https://bluetata.com/");
    Document doc = Jsoup.parse(pageAsXml);
    // Further processing goes here, e.g. doc.select("img[src$=.png]") for all PNG images
    System.out.println(doc.toString());
}
 
Example 15
Project: poe-ladder-tracker-java   File: CSVLinkCreator.java   View Source Code Vote up 5 votes
/**
 * Starts the CSV link creation process: loads the forum thread, waits for
 * background JavaScript, takes the href of the first anchor matching
 * {@code xPath}, drops its last character and prefixes the site address.
 * {@code csvFileLink} is only updated when the whole process succeeds.
 *
 * @throws Exception If there was an error while getting the CSV specific information from the forum thread.
 */
public void create() throws Exception{
	java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(java.util.logging.Level.OFF);

	// try-with-resources: the client is closed even when the page load or lookup throws
	try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
		HtmlPage page = (HtmlPage) webClient.getPage(forumThreadLink);
		webClient.waitForBackgroundJavaScriptStartingBefore(waitForJavaScript);
		HtmlAnchor anchor = (HtmlAnchor) page.getByXPath(xPath).get(0);
		// Work on a local so the field is never left holding the raw, unprefixed href
		String href = anchor.getHrefAttribute();
		csvFileLink = "http://www.pathofexile.com" + href.subSequence(0, href.length()-1);
	}
}
 
Example 16
Project: XC2   File: Server.java   View Source Code Vote up 5 votes
/**
 * Creates the server: initialises state fields, the worker pool, the bounded
 * buffer and the network server, then configures the Chrome-emulating browser.
 *
 * NOTE(review): credentials are hard-coded below, and {@code this} is handed
 * to {@code Network_Server} before the browser options have been applied —
 * statement order may matter here; confirm before rearranging.
 *
 * @throws IOException declared by the constructor signature
 */
public Server()throws IOException{
    browser = new WebClient(BrowserVersion.CHROME);
    problems = new ArrayList<>();
    done = false;
    isSubmitting = false;
    user_name = "XC2";
    pass_word = "donthackme";
    application = Executors.newCachedThreadPool();
    // The buffer holds at most 100 entries (may be adjusted if needed)
    buffer_queue = new LinkedBlockingQueue<>(100);
    // Receives a reference to this still-under-construction Server
    network = new Network_Server(this, this.port, this.max_clients);
    
    // Browser initialisation to increase efficiency (the cookies part is essential)
    browser.getOptions().setUseInsecureSSL(true);
    browser.getOptions().setJavaScriptEnabled(false);
    browser.getOptions().setCssEnabled(false);
    browser.getOptions().setThrowExceptionOnScriptError(false);
    browser.getCookieManager().setCookiesEnabled(true);
    browser.setAjaxController(new NicelyResynchronizingAjaxController());
}
 
Example 17
Project: gecco-htmlunit   File: HtmlUnitDownloder.java   View Source Code Vote up 5 votes
/**
 * Creates the downloader's Chrome-emulating client: CSS and redirects off,
 * failing status codes and script errors tolerated, scripts limited to
 * a JavaScript timeout of 1000.
 */
public HtmlUnitDownloder() {
	webClient = new WebClient(BrowserVersion.CHROME);

	// Features that are switched off
	webClient.getOptions().setCssEnabled(false);
	webClient.getOptions().setRedirectEnabled(false);

	// Errors are tolerated rather than thrown
	webClient.getOptions().setThrowExceptionOnScriptError(false);
	webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);

	webClient.setJavaScriptTimeout(1000);
	//webClient.setJavaScriptErrorListener(new GeccoJavaScriptErrorListener());
}
 
Example 18
Project: JSearcher   File: PostDemo.java   View Source Code Vote up 5 votes
/**
 * Creates a Chrome-emulating client with CSS and JavaScript disabled and
 * cookies enabled.
 */
public PostDemo() {
	webClient = new WebClient(BrowserVersion.CHROME);

	// Keep a handle on the cookie manager and make sure cookies are on
	cookieMan = webClient.getCookieManager();
	cookieMan.setCookiesEnabled(true);

	// Plain HTML only
	webClient.getOptions().setJavaScriptEnabled(false);
	webClient.getOptions().setCssEnabled(false);
}
 
Example 19
Project: JSearcher   File: LoginSimulator.java   View Source Code Vote up 5 votes
/**
 * Creates a Chrome-emulating client with CSS and JavaScript disabled and
 * cookies enabled.
 */
public LoginSimulator() {
	webClient = new WebClient(BrowserVersion.CHROME);

	// Keep a handle on the cookie manager and make sure cookies are on
	cookieMan = webClient.getCookieManager();
	cookieMan.setCookiesEnabled(true);

	// Plain HTML only
	webClient.getOptions().setJavaScriptEnabled(false);
	webClient.getOptions().setCssEnabled(false);
}
 
Example 20
Project: JSearcher   File: PostDownloader.java   View Source Code Vote up 5 votes
/**
 * Creates a Chrome-emulating client with the CSS and JavaScript loaders
 * disabled and cookies enabled.
 */
public PostDownloader() {
	webClient = new WebClient(BrowserVersion.CHROME);

	// Cookies: keep the manager and enable it
	cookieMan = webClient.getCookieManager();
	cookieMan.setCookiesEnabled(true);

	// Loaders that are not needed
	webClient.getOptions().setJavaScriptEnabled(false); // no JS loader
	webClient.getOptions().setCssEnabled(false); // no CSS loader
}
 
Example 21
Project: https-github.com-g0t4-jenkins2-course-spring-boot   File: LocalHostWebConnectionHtmlUnitDriverTests.java   View Source Code Vote up 5 votes
/**
 * Constructing the driver with a {@code null} environment and an explicit
 * browser version must fail with an {@link IllegalArgumentException} whose
 * message contains "Environment must not be null".
 */
@Test
public void createWithBrowserVersionWhenEnvironmentIsNullWillThrowException() throws Exception {
	// Register the expectation first, then trigger the failure
	thrown.expect(IllegalArgumentException.class);
	thrown.expectMessage("Environment must not be null");

	new LocalHostWebConnectionHtmlUnitDriver(null, BrowserVersion.CHROME);
}
 
Example 22
Project: spring-boot-concourse   File: LocalHostWebConnectionHtmlUnitDriverTests.java   View Source Code Vote up 5 votes
/**
 * Constructing the driver with a {@code null} environment and an explicit
 * browser version must fail with an {@link IllegalArgumentException} whose
 * message contains "Environment must not be null".
 */
@Test
public void createWithBrowserVersionWhenEnvironmentIsNullWillThrowException() throws Exception {
	// Register the expectation first, then trigger the failure
	thrown.expect(IllegalArgumentException.class);
	thrown.expectMessage("Environment must not be null");

	new LocalHostWebConnectionHtmlUnitDriver(null, BrowserVersion.CHROME);
}
 
Example 23
Project: Takoyaki   File: Staff.java   View Source Code Vote up 5 votes
/**
 * Creates a Chrome-emulating client with JavaScript and CSS disabled,
 * silences the "org.apache" and "com.gargoylesoftware" loggers, and logs in
 * when account properties are supplied.
 *
 * @param logger            stored for later use by this instance
 * @param timeout           passed to the client's timeout option
 * @param encoding          stored for later use by this instance
 * @param accountProperties login data; when {@code null} no login is attempted
 */
public Staff(PrefixedLogger logger, int timeout, String encoding, JSONObject accountProperties){
    super(BrowserVersion.CHROME);

    // Library logging off
    Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
    Logger.getLogger("org.apache").setLevel(Level.OFF);

    // Collaborators must be in place before a possible login below
    this.logger = logger;
    this.encoding = encoding;

    // Client options
    this.getOptions().setCssEnabled(false);
    this.getOptions().setJavaScriptEnabled(false);
    this.getOptions().setTimeout(timeout);

    if (accountProperties == null) {
        return;
    }
    this.login(accountProperties);
}
 
Example 24
Project: WikiKreator   File: WikiEditCalculator.java   View Source Code Vote up 5 votes
/**
 * Loads the contributions page at {@code ucContributionsURL} and returns the
 * distinct titles of the listed edits, in page order. The last 10 entries
 * (the initial edits made for eligibility) are ignored, as are titles
 * starting with "Wiki" or "User talk".
 *
 * @return distinct edited titles; empty when the page lists 10 entries or fewer
 * @throws FailingHttpStatusCodeException declared for the page load
 * @throws MalformedURLException if the contributions URL is malformed
 * @throws IOException if the page cannot be fetched
 */
private List<String> getEditedLinks() throws FailingHttpStatusCodeException, MalformedURLException, IOException
{
	java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 
	System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
	WebClient browser=new WebClient(BrowserVersion.CHROME);
	try
	{
		browser.getOptions().setJavaScriptEnabled(false);
		browser.getOptions().setThrowExceptionOnFailingStatusCode(false);
		browser.getOptions().setThrowExceptionOnScriptError(false);
		HtmlPage rawpage = browser.getPage(ucContributionsURL);
		List<?> elementList = rawpage.getByXPath("//a[@class='mw-contributions-title']");
		// Skip the last 10 entries (initial edits: for eligibility). Clamp at zero so a
		// page with fewer than 10 entries yields an empty result instead of an exception.
		int usable=Math.max(0, elementList.size()-10);
		// Insertion-ordered set: same order as before, without the quadratic contains() scan
		java.util.Set<String> distinctTitles=new java.util.LinkedHashSet<String>();
		for(int i=0;i<usable;i++)
		{
			HtmlAnchor result = (HtmlAnchor) elementList.get(i);
			String value=(result.getAttributeNode("title").getNodeValue());
			if(!value.startsWith("Wiki") && !value.startsWith("User talk"))
			{
				distinctTitles.add(value);
			}
		}
		return new ArrayList<String>(distinctTitles);
	}
	finally
	{
		// Release the browser on failure as well as on success
		browser.closeAllWindows();
	}
}
 
Example 25
Project: WikiKreator   File: WikiEditCalculator.java   View Source Code Vote up 5 votes
/**
 * For every article returned by {@code getEditedLinks()}, loads its history
 * page and prints one line of the form
 * {@code historyUrl||changeInSize||firstEditIndex}, where the size change is
 * the current size minus the size of the entry just after the first-edit index.
 *
 * @throws FailingHttpStatusCodeException declared for the page loads
 * @throws MalformedURLException if a history URL is malformed
 * @throws IOException if a page cannot be fetched
 */
private void getEditedList() throws FailingHttpStatusCodeException, MalformedURLException, IOException
{
	List<String> articleListContributed=getEditedLinks();
	WebClient browser=new WebClient(BrowserVersion.CHROME);
	try
	{
		browser.getOptions().setJavaScriptEnabled(false);
		browser.getOptions().setThrowExceptionOnFailingStatusCode(false);
		browser.getOptions().setThrowExceptionOnScriptError(false);
		String wikiURL="https://en.wikipedia.org/w/index.php?title=";
		for(String s:articleListContributed)
		{//*[@id="pagehistory"]/li[2]
			// Build the history URL once and use it for both the request and the report
			String finalURL=wikiURL+s.replace(' ', '_')+"&action=history";
			HtmlPage rawpage= browser.getPage(finalURL);
			@SuppressWarnings("unchecked")
			List<DomNode> totalEditsUsers =(List<DomNode>) rawpage.getByXPath("//span[@class='history-user']");
			@SuppressWarnings("unchecked")
			List<DomNode> sizeArticles = (List<DomNode>) rawpage.getByXPath("//span[@class='history-size']");
			// Computed once; it is needed for both the size lookup and the report
			int firstEditIndex=getFirstEditIndex(totalEditsUsers);
			int currentSize=getContentSize(sizeArticles,0);
			int beforeEditSize=getContentSize(sizeArticles, firstEditIndex+1);
			int changeInSize=(currentSize-beforeEditSize);
			System.out.println(finalURL+"||"+changeInSize+"||"+firstEditIndex);
		}
	}
	finally
	{
		// The browser was previously never released
		browser.closeAllWindows();
	}
}
 
Example 26
Project: WikiKreator   File: WikiExportSupporter.java   View Source Code Vote up 5 votes
/**
 * Collects the distinct article names listed by Wikipedia's Special:Export
 * page for every given category, submits them all through the export form,
 * and writes the response to {@code filename2.xml}.
 *
 * @param categoryList category entries; the part after the first ':' of each
 *                     entry is used as the category name
 * @throws FailingHttpStatusCodeException declared for the page loads
 * @throws MalformedURLException if an export URL is malformed
 * @throws IOException if a page cannot be fetched or the file cannot be written
 */
private void extractArticles(List<String> categoryList) throws FailingHttpStatusCodeException, MalformedURLException, IOException
{
	List<String> articleList=new ArrayList<String>();
	// Companion set for de-duplication; the list keeps first-seen order
	java.util.Set<String> seenArticles=new java.util.HashSet<String>();
	WebClient webClient = new WebClient(BrowserVersion.CHROME);
	try
	{
		int position=0; // running index, replaces an indexOf() scan per category
		for(String s:categoryList)
		{
			System.out.println("Running category:"+s+"/"+position);
			position++;
			HtmlPage page = webClient.getPage("https://en.wikipedia.org/w/index.php?title=Special%3AExport&" +
					"addcat=&catname="+s.split(":")[1]+"&wpDownload=");
			String textArea=(page.getElementByName("pages").asText());
			String[] lines = textArea.split(System.getProperty("line.separator"));
			for(String line:lines)
			{
				if(!line.startsWith("Category:") && seenArticles.add(line))
				{
					articleList.add(line);
				}
			}
		}

		System.err.println("Total articles:"+articleList.size());
		HtmlPage pageFinal = webClient.getPage("https://en.wikipedia.org/w/index.php?title=Special%3AExport");
		pageFinal.getElementByName("pages").setTextContent(listToString(articleList));
		HtmlForm form = pageFinal.getForms().get(0);
		HtmlSubmitInput button = (HtmlSubmitInput) form.getInputsByValue("Export").get(0);
		String is = button.click().getWebResponse().getContentAsString();
		File f=new File("filename2.xml");
		f.createNewFile();
		BufferedWriter bw = new BufferedWriter(new FileWriter(f.getAbsoluteFile()));
		try
		{
			bw.write(is);
		}
		finally
		{
			// Close the file even when the write fails
			bw.close();
		}
	}
	finally
	{
		// The client was previously never released
		webClient.closeAllWindows();
	}
}
 
Example 27
Project: MangaManagerAndDownloader   File: HTMLUnitHelper.java   View Source Code Vote up 5 votes
/**
 * Builds a quiet Chrome-emulating client that only performs real requests for
 * URLs containing the domain name of {@code url}; every other request gets an
 * empty response.
 *
 * @param url address whose domain name (via {@code getDomainName}) decides
 *            which requests are actually sent
 * @return the configured client; the caller owns it
 */
private static WebClient createWebClient(final String url) {
    final WebClient client = new WebClient(BrowserVersion.CHROME);

    // http://stackoverflow.com/a/23482615/2246865 by Neil McGuigan
    java.util.logging.Logger.getLogger("com.gargoylesoftware.htmlunit").setLevel(Level.OFF);

    // Silence CSS errors and incorrectness notifications
    client.setCssErrorHandler(new SilentCssErrorHandler());
    client.setIncorrectnessListener(new IncorrectnessListener() {
        @Override
        public void notify(String message, Object origin) {
            // intentionally ignored
        }
    });

    client.setCookieManager(cookieManager);

    // Options: tolerate failures, skip CSS, accept any SSL certificate
    client.getOptions().setUseInsecureSSL(true);
    client.getOptions().setCssEnabled(false);
    client.getOptions().setPrintContentOnFailingStatusCode(false);
    client.getOptions().setThrowExceptionOnScriptError(false);
    client.getOptions().setThrowExceptionOnFailingStatusCode(false);

    // http://stackoverflow.com/a/14227559/2246865 by Lee
    client.setWebConnection(new WebConnectionWrapper(client) {
        @Override
        public WebResponse getResponse(final WebRequest request) throws IOException {
            final String requested = request.getUrl().toString();
            if (!requested.contains(getDomainName(url))) {
                // Foreign domain: answer with an empty body instead of going to the network
                return new StringWebResponse("", request.getUrl());
            }
            return super.getResponse(request);
        }
    });

    return client;
}
 
Example 28
Project: PkuIntern   File: FetchPage.java   View Source Code Vote up 5 votes
/**
 * Fetches the page at the given URL with JavaScript and CSS disabled.
 *
 * @param url address of the page to fetch
 * @return the fetched page, or {@code null} when fetching failed (the error is logged)
 */
public HtmlPage fetchPage(String url) {
	final WebClient client = new WebClient(BrowserVersion.CHROME);
	client.getOptions().setCssEnabled(false);
	client.getOptions().setJavaScriptEnabled(false);
	try {
		final HtmlPage fetched = client.getPage(url);
		return fetched;
	} catch (Exception failure) {
		logger.error(failure.getMessage(), failure);
		return null;
	} finally {
		// Runs on both paths, before the value is handed back to the caller
		client.closeAllWindows();
	}
}
 
Example 29
Project: PkuIntern   File: FetchPage.java   View Source Code Vote up 5 votes
/**
 * Fetches the page at the given URL with JavaScript enabled (CSS stays
 * disabled), so scripts on the page are interpreted.
 *
 * @param url address of the page to fetch
 * @return the fetched page, or {@code null} when fetching failed (the error is logged)
 */
public HtmlPage fetchPageEnableJS(String url) {
	final WebClient client = new WebClient(BrowserVersion.CHROME);
	client.getOptions().setCssEnabled(false);
	client.getOptions().setJavaScriptEnabled(true);
	try {
		final HtmlPage fetched = client.getPage(url);
		return fetched;
	} catch (Exception failure) {
		logger.error(failure.getMessage(), failure);
		return null;
	} finally {
		// Runs on both paths, before the value is handed back to the caller
		client.closeAllWindows();
	}
}
 
Example 30
Project: redsniff   File: SeleniumTesterFactory.java   View Source Code Vote up 5 votes
/**
 * Maps the configured {@code htmlUnitBrowserVersion} name to an HtmlUnit
 * {@link BrowserVersion}.
 *
 * @return the version for "FIREFOX_38", "CHROME" or "INTERNET_EXPLORER_8";
 *         {@link BrowserVersion#getDefault()} for any other name
 */
@SuppressWarnings("deprecation")
    private BrowserVersion getHtmlUnitBrowserVersion() {
        // FIREFOX_10 and FIREFOX_17 were already disabled and stay unmapped
        switch (htmlUnitBrowserVersion) {
            case "FIREFOX_38":
                return BrowserVersion.FIREFOX_38;
            case "CHROME":
                return BrowserVersion.CHROME;
            case "INTERNET_EXPLORER_8":
                return BrowserVersion.INTERNET_EXPLORER_8;
            default:
                return BrowserVersion.getDefault();
        }
    }
 
Example 31
Project: MangaManagerAndDownloader   File: HTMLUnitHelper.java   View Source Code Vote up 5 votes
/**
 * Builds a quiet Chrome-emulating client that only performs real requests for
 * URLs containing the domain name of {@code url}; every other request gets an
 * empty response.
 *
 * @param url address whose domain name (via {@code getDomainName}) decides
 *            which requests are actually sent
 * @return the configured client; the caller owns it
 */
private static WebClient createWebClient(final String url) {
    final WebClient client = new WebClient(BrowserVersion.CHROME);

    // http://stackoverflow.com/a/23482615/2246865 by Neil McGuigan
    java.util.logging.Logger.getLogger("com.gargoylesoftware.htmlunit").setLevel(Level.OFF);

    // Silence CSS errors and incorrectness notifications
    client.setCssErrorHandler(new SilentCssErrorHandler());
    client.setIncorrectnessListener(new IncorrectnessListener() {
        @Override
        public void notify(String message, Object origin) {
            // intentionally ignored
        }
    });

    client.setCookieManager(cookieManager);

    // Options: tolerate failures, skip CSS, accept any SSL certificate
    client.getOptions().setUseInsecureSSL(true);
    client.getOptions().setCssEnabled(false);
    client.getOptions().setPrintContentOnFailingStatusCode(false);
    client.getOptions().setThrowExceptionOnScriptError(false);
    client.getOptions().setThrowExceptionOnFailingStatusCode(false);

    // http://stackoverflow.com/a/14227559/2246865 by Lee
    client.setWebConnection(new WebConnectionWrapper(client) {
        @Override
        public WebResponse getResponse(final WebRequest request) throws IOException {
            final String requested = request.getUrl().toString();
            if (!requested.contains(getDomainName(url))) {
                // Foreign domain: answer with an empty body instead of going to the network
                return new StringWebResponse("", request.getUrl());
            }
            return super.getResponse(request);
        }
    });

    return client;
}
 
Example 32
Project: mot-automated-testsuite   File: WebDriverWrapperTest.java   View Source Code Vote up 4 votes
/**
 * Substitutes HtmlUnit, running in Chrome emulation mode, for a real Chrome
 * browser, and remembers the created driver in {@code webDriver}.
 *
 * @return The web driver instance that was just created
 */
@Override
protected WebDriver createWebDriver() {
    webDriver = new HtmlUnitDriver(BrowserVersion.CHROME);
    return this.webDriver;
}
 
Example 33
Project: awesome-agile   File: AwesomeAgileFunctionalTest.java   View Source Code Vote up 4 votes
/**
 * Verifies that different users get different dashboards with links to different documents.
 * User #1 works in the shared driver; user #2 works in a second driver that
 * is created and shut down inside this test.
 *
 * @throws Exception if any browser interaction fails
 */
@Test
public void testSeparateDashboards() throws Exception {
  String templateOne = DEFINITION_OF_READY_CONTENTS + "1";
  String templateTwo = DEFINITION_OF_READY_CONTENTS + "2";

  Person userOne = createUser();
  Person userTwo = createUser();

  // Create hackpad for user #1
  fakeGoogleServer.setPerson(userOne);

  LandingPage landingPage = PageFactory.initElements(driver, LandingPage.class);
  landingPage.loginWithGoogle(getEndpoint());
  fakeHackpadServer.addHackpad(
      new PadIdentity(DEFINITION_OF_READY_TEMPLATE_ID),
      templateOne);
  String currentWindow = driver.getWindowHandle();
  landingPage.createDefinitionOfReady();
  closeWindowsExceptFor(driver, currentWindow);


  // Log in with user #2, there should be no definition of ready on the dashboard
  fakeGoogleServer.setPerson(userTwo);

  HtmlUnitDriver driverTwo = new HtmlUnitDriver(BrowserVersion.CHROME);
  try {
    driverTwo.setJavascriptEnabled(true);
    LandingPage landingPageTwo = PageFactory.initElements(driverTwo, LandingPage.class);
    landingPageTwo.loginWithGoogle(getEndpoint());
    assertFalse(landingPageTwo.isDefinitionOfReadyViewable());

    // Create another definition of ready
    fakeHackpadServer.addHackpad(
        new PadIdentity(DEFINITION_OF_READY_TEMPLATE_ID),
        templateTwo);
    String currentWindowTwo = driverTwo.getWindowHandle();
    landingPageTwo.createDefinitionOfReady();
    closeWindowsExceptFor(driverTwo, currentWindowTwo);

    assertDefinitionOfReadyContents(driver, templateOne);

    assertDefinitionOfReadyContents(driverTwo, templateTwo);
  } finally {
    // The second driver is local to this test, so it must be shut down here,
    // including when an assertion above fails
    driverTwo.quit();
  }
}
 
Example 34
Project: WikiKreator   File: XMLWikipediaExtractor.java   View Source Code Vote up 4 votes
/**
 * Runs the export-and-modelling pipeline for one Wikipedia category.
 *
 * <p>Steps, in order: load the Special:Export page for the category, read the
 * non-stub sub-category names from its "pages" text area, expand them with
 * {@code searchCategoriesUntilDepth}, download dumps with {@code extractArticles},
 * convert every dump file with {@code processWikiDumpWithPython}, then call
 * {@code writeFilesInText}, {@code LabelContentExtractor.createSectionContents} and
 * {@code TopicModelGenerator.generateTopicModels}.
 *
 * @param mainCategoryName category name, inserted verbatim into the "catname" query parameter
 * @return the value returned by {@code PassageClassifier.getBestNumTopic(mainCategoryName)}
 * @throws Exception if a page cannot be loaded, the dump folder cannot be listed,
 *                   or any of the helper steps fails
 */
public int getAllArticlesUnderCategory(String mainCategoryName) throws Exception
{
	final String categoryPrefix="Category:";
	WebClient webClient = new WebClient(BrowserVersion.CHROME);
	try
	{
		HtmlPage page = webClient.getPage("https://en.wikipedia.org/w/index.php?title=Special%3AExport&" +
				"addcat=&catname="+mainCategoryName+"&wpDownload=");
		System.out.println(page.getUrl());

		// The raw dump folder is deliberately not wiped here; existing dumps are reused
		File folderForDumps=new File(dirForWikiDumps+"/"+mainCategoryName+"-xml");
		Thread.sleep(2000);
		folderForDumps.mkdirs();
		HtmlForm form = page.getForms().get(0);
		System.out.println(form.getAttribute("method"));
		System.out.println(page.getTitleText());

		// Retrieve the sub-category names listed in the export text area
		List<String> categoryList=new ArrayList<String>();
		categoryList.add(mainCategoryName);
		String textArea=(page.getElementByName("pages").asText());
		String[] lines = textArea.split(System.getProperty("line.separator"));
		for(String line:lines)
		{
			if(line.startsWith(categoryPrefix) && !line.contains("stub"))
			{
				// Strip only the leading prefix so names that themselves contain ':' stay intact
				String subCategory=line.substring(categoryPrefix.length());
				if(subCategory.length()>0)
				{
					categoryList.add(subCategory);
				}
			}
		}

		System.out.println(categoryList.size());
		categoryList=searchCategoriesUntilDepth(categoryList);
		System.out.println("Printing categories:"+categoryList);
		System.out.println("Exploring "+categoryList.size()+" categories.......");
		extractArticles(categoryList, folderForDumps.getAbsolutePath());//Uncomment this if new run

		// The processed-dump folder is rebuilt from scratch on every run
		File folderForProcessedDumps=new File(dirForProcessedWikiXml+"/"+mainCategoryName+"-xml");
		FileUtils.deleteQuietly(folderForProcessedDumps);
		Thread.sleep(2000);
		folderForProcessedDumps.mkdirs();
		File[] xmlFiles=folderForDumps.listFiles();
		if(xmlFiles==null)
		{
			// listFiles() returns null when the path is not a readable directory
			throw new IOException("Cannot list dump folder: "+folderForDumps.getAbsolutePath());
		}
		for(File f:xmlFiles)
		{
			processWikiDumpWithPython(f.getAbsolutePath(),
					folderForProcessedDumps.getAbsolutePath()+
					"/"+f.getName().replaceAll("xml", "")+"txt"
										);
		}

		writeFilesInText(mainCategoryName);
		LabelContentExtractor lce=new LabelContentExtractor();
		String wikiTxtDir=dirForProcessedWikiTxt+"/"+mainCategoryName+"-txt";
		String urlFile=urlStoreDir+"/"+mainCategoryName+"-url";
		lce.createSectionContents(wikiTxtDir,urlFile, mainCategoryName);
		TopicModelGenerator.generateTopicModels(mainCategoryName);
		return PassageClassifier.getBestNumTopic(mainCategoryName);
	}
	finally
	{
		// Release the browser windows even when a step above throws
		webClient.closeAllWindows();
	}
}
 
Example 35
Project: WikiKreator   File: XMLWikipediaExtractor.java   View Source Code Vote up 4 votes
/**
 * Downloads and pre-processes the articles of one Wikipedia category without
 * building topic models.
 *
 * <p>Steps, in order: load the Special:Export page for the category, read the
 * non-stub sub-category names from its "pages" text area, expand them with
 * {@code searchCategoriesUntilDepth}, download dumps with {@code extractArticles},
 * convert every dump file with {@code processWikiDumpWithPython}, then call
 * {@code writeFilesInText} and {@code LabelContentExtractor.createSectionContents}.
 *
 * @param mainCategoryName category name, inserted verbatim into the "catname" query parameter
 * @throws Exception if a page cannot be loaded, the dump folder cannot be listed,
 *                   or any of the helper steps fails
 */
public void storeStubArticles(String mainCategoryName) throws Exception
{
	final String categoryPrefix="Category:";
	WebClient webClient = new WebClient(BrowserVersion.CHROME);
	try
	{
		HtmlPage page = webClient.getPage("https://en.wikipedia.org/w/index.php?title=Special%3AExport&" +
				"addcat=&catname="+mainCategoryName+"&wpDownload=");
		System.out.println(page.getUrl());

		// The raw dump folder is deliberately not wiped here; existing dumps are reused
		File folderForDumps=new File(dirForWikiDumps+"/"+mainCategoryName+"-xml");
		Thread.sleep(2000);
		folderForDumps.mkdirs();
		HtmlForm form = page.getForms().get(0);
		System.out.println(form.getAttribute("method"));
		System.out.println(page.getTitleText());

		// Retrieve the sub-category names listed in the export text area
		List<String> categoryList=new ArrayList<String>();
		categoryList.add(mainCategoryName);
		String textArea=(page.getElementByName("pages").asText());
		String[] lines = textArea.split(System.getProperty("line.separator"));
		for(String line:lines)
		{
			if(line.startsWith(categoryPrefix) && !line.contains("stub"))
			{
				// Strip only the leading prefix so names that themselves contain ':' stay intact
				String subCategory=line.substring(categoryPrefix.length());
				if(subCategory.length()>0)
				{
					categoryList.add(subCategory);
				}
			}
		}

		System.out.println(categoryList.size());
		categoryList=searchCategoriesUntilDepth(categoryList);
		System.out.println("Printing categories:"+categoryList);
		System.out.println("Exploring "+categoryList.size()+" categories.......");
		extractArticles(categoryList, folderForDumps.getAbsolutePath());//Uncomment this if new run

		// The processed-dump folder is rebuilt from scratch on every run
		File folderForProcessedDumps=new File(dirForProcessedWikiXml+"/"+mainCategoryName+"-xml");
		FileUtils.deleteQuietly(folderForProcessedDumps);
		Thread.sleep(2000);
		folderForProcessedDumps.mkdirs();
		File[] xmlFiles=folderForDumps.listFiles();
		if(xmlFiles==null)
		{
			// listFiles() returns null when the path is not a readable directory
			throw new IOException("Cannot list dump folder: "+folderForDumps.getAbsolutePath());
		}
		for(File f:xmlFiles)
		{
			processWikiDumpWithPython(f.getAbsolutePath(),
					folderForProcessedDumps.getAbsolutePath()+
					"/"+f.getName().replaceAll("xml", "")+"txt"
										);
		}

		writeFilesInText(mainCategoryName);
		LabelContentExtractor lce=new LabelContentExtractor();
		String wikiTxtDir=dirForProcessedWikiTxt+"/"+mainCategoryName+"-txt";
		String urlFile=urlStoreDir+"/"+mainCategoryName+"-url";
		lce.createSectionContents(wikiTxtDir,urlFile, mainCategoryName);
	}
	finally
	{
		// Release the browser windows even when a step above throws
		webClient.closeAllWindows();
	}
}
 
Example 36
Project: WikiKreator   File: XMLWikipediaExtractor.java   View Source Code Vote up 4 votes
/**
 * Downloads and pre-processes the articles of one Wikipedia category, keeping
 * stub sub-categories and without expanding sub-categories any further.
 *
 * <p>Steps, in order: load the Special:Export page for the category, wipe and
 * recreate the raw dump folder, read every sub-category name from the "pages"
 * text area, download dumps with {@code extractArticles}, convert every dump
 * file with {@code processWikiDumpWithPython}, then call {@code writeFilesInText}
 * and {@code LabelContentExtractor.createSectionContents}.
 *
 * @param mainCategoryName category name, inserted verbatim into the "catname" query parameter
 * @throws Exception if a page cannot be loaded, the dump folder cannot be listed,
 *                   or any of the helper steps fails
 */
public void getAllStubsUnderCategory(String mainCategoryName) throws Exception
	{
		final String categoryPrefix="Category:";
		WebClient webClient = new WebClient(BrowserVersion.CHROME);
		try
		{
			HtmlPage page = webClient.getPage("https://en.wikipedia.org/w/index.php?title=Special%3AExport&" +
					"addcat=&catname="+mainCategoryName+"&wpDownload=");
			System.out.println(page.getUrl());

			// Unlike the sibling methods, the raw dump folder is rebuilt from scratch here
			File folderForDumps=new File(dirForWikiDumps+"/"+mainCategoryName+"-xml");
			FileUtils.deleteQuietly(folderForDumps);
			Thread.sleep(2000);
			folderForDumps.mkdirs();
			HtmlForm form = page.getForms().get(0);
			System.out.println(form.getAttribute("method"));
			System.out.println(page.getTitleText());

			// Retrieve the sub-category names (stubs included) listed in the export text area
			List<String> categoryList=new ArrayList<String>();
			categoryList.add(mainCategoryName);
			String textArea=(page.getElementByName("pages").asText());
			String[] lines = textArea.split(System.getProperty("line.separator"));
			for(String line:lines)
			{
				if(line.startsWith(categoryPrefix))
				{
					// Strip only the leading prefix so names that themselves contain ':' stay intact
					String subCategory=line.substring(categoryPrefix.length());
					if(subCategory.length()>0)
					{
						categoryList.add(subCategory);
					}
				}
			}

			System.out.println(categoryList.size());
			System.out.println("Printing categories:"+categoryList);
			System.out.println("Exploring "+categoryList.size()+" categories.......");
			extractArticles(categoryList, folderForDumps.getAbsolutePath());

			File folderForProcessedDumps=new File(dirForProcessedWikiXml+"/"+mainCategoryName+"-xml");
			FileUtils.deleteQuietly(folderForProcessedDumps);
			Thread.sleep(2000);
			folderForProcessedDumps.mkdirs();
			File[] xmlFiles=folderForDumps.listFiles();
			if(xmlFiles==null)
			{
				// listFiles() returns null when the path is not a readable directory
				throw new IOException("Cannot list dump folder: "+folderForDumps.getAbsolutePath());
			}
			for(File f:xmlFiles)
			{
				processWikiDumpWithPython(f.getAbsolutePath(),
						folderForProcessedDumps.getAbsolutePath()+
						"/"+f.getName().replaceAll("xml", "")+"txt"
											);
			}

			writeFilesInText(mainCategoryName);
			LabelContentExtractor lce=new LabelContentExtractor();
			String wikiTxtDir=dirForProcessedWikiTxt+"/"+mainCategoryName+"-txt";
			String urlFile=urlStoreDir+"/"+mainCategoryName+"-url";
			lce.createSectionContents(wikiTxtDir,urlFile, mainCategoryName);
		}
		finally
		{
			// Release the browser windows even when a step above throws
			webClient.closeAllWindows();
		}
	}
 
Example 37
Project: WikiKreator   File: XMLWikipediaExtractor.java   View Source Code Vote up 4 votes
/**
 * Expands a list of category names by one level of non-stub sub-categories.
 *
 * <p>For every name in {@code categoryList} the Special:Export page is loaded and
 * each "Category:" line of its "pages" text area that does not contain "stub" is
 * collected. Newly found names are appended to {@code categoryList}; the list is
 * modified in place and also returned. The {@code depth<=0} loop condition means
 * exactly one expansion pass is performed.
 *
 * @param categoryList category names without the "Category:" prefix; must be mutable
 * @return the same list instance, with the discovered sub-categories appended
 * @throws FailingHttpStatusCodeException if a page request returns a failing status
 * @throws MalformedURLException if a request URL cannot be built
 * @throws IOException if a page cannot be loaded
 */
public List<String> searchCategoriesUntilDepth(List<String> categoryList) throws FailingHttpStatusCodeException, MalformedURLException, IOException
{
	final String categoryPrefix="Category:";
	int depth=0;
	List<String> visitedCategories=new ArrayList<String>();
	while(depth<=0)
	{
		List<String> tempCatList=new ArrayList<String>();
		WebClient webClient = new WebClient(BrowserVersion.CHROME);
		try
		{
			for(String s:categoryList)
			{
				if(visitedCategories.contains(s))
					continue;
				System.err.println("Running category:"+s+"/"+categoryList.indexOf(s));
				visitedCategories.add(s);
				HtmlPage page = webClient.getPage("https://en.wikipedia.org/w/index.php?title=Special%3AExport&" +
						"addcat=&catname="+s+"&wpDownload=");
				String textArea=(page.getElementByName("pages").asText());
				String[] lines = textArea.split(System.getProperty("line.separator"));
				for(String line:lines)
				{
					if(line.startsWith(categoryPrefix) && !line.contains("stub"))
					{
						// Strip only the leading prefix so names that themselves contain ':' stay intact
						String subCategory=line.substring(categoryPrefix.length());
						if(subCategory.length()>0 && !categoryList.contains(subCategory))
						{
							tempCatList.add(subCategory);
						}
					}
				}
			}
		}
		finally
		{
			// Release the browser windows even when a page load throws
			webClient.closeAllWindows();
		}

		// Appended after the iteration above so categoryList is not modified while being traversed
		for(String l:tempCatList)
		{
			if(!categoryList.contains(l))
				categoryList.add(l);
		}
		depth++;
	}

	return categoryList;

}
 
Example 38
Project: WikiKreator   File: XMLWikipediaExtractor.java   View Source Code Vote up 4 votes
/**
 * Downloads one XML export per category into {@code folderForXML}.
 *
 * <p>For each category the Special:Export page is loaded to obtain the article
 * titles from its "pages" text area, the titles are put into a fresh export form,
 * the "Export" button is clicked and the response body is written as UTF-8 to
 * {@code Wikidump.<category>.xml}. Categories whose name contains a double quote,
 * or whose output file already exists, are skipped.
 *
 * @param categoryList category names without the "Category:" prefix
 * @param folderForXML directory that receives the dump files
 * @throws FailingHttpStatusCodeException if a page request returns a failing status
 * @throws MalformedURLException if a request URL cannot be built
 * @throws IOException if a page cannot be loaded or a dump file cannot be written
 */
private void extractArticles(List<String> categoryList,
		String folderForXML) throws FailingHttpStatusCodeException, MalformedURLException, IOException
{

	WebClient webClient = new WebClient(BrowserVersion.CHROME);
	try
	{
		// NOTE(review): this turns off TLS certificate validation for every request made
		// by this client; confirm it is still required for en.wikipedia.org.
		webClient.getOptions().setUseInsecureSSL(true);
		for(String s:categoryList)
		{
			if(s.contains("\""))
				continue;
			// '/' is only a problem in the file name; the request must use the real category name
			String fileSafeName=s.replaceAll("/", "-");
			File f=new File(folderForXML+"/Wikidump."+fileSafeName+".xml");
			if(f.exists())
				continue;
			System.out.println("Running category:"+s+"/"+categoryList.indexOf(s));
			HtmlPage page = webClient.getPage("https://en.wikipedia.org/w/index.php?title=Special%3AExport&" +
					"addcat=&catname="+s+"&wpDownload=");
			String textArea=(page.getElementByName("pages").asText());
			String[] lines = textArea.split(System.getProperty("line.separator"));
			List<String> articleList=new ArrayList<String>();
			for(String line:lines)
			{
				if(!line.startsWith("Category:") && !articleList.contains(line))
				{
					articleList.add(line);
				}
			}
			HtmlPage pageFinal = webClient.getPage("https://en.wikipedia.org/w/index.php?title=Special%3AExport");
			pageFinal.getElementByName("pages").setTextContent(listToString(articleList));
			HtmlForm form = pageFinal.getForms().get(0);
			HtmlSubmitInput button = (HtmlSubmitInput) form.getInputsByValue("Export").get(0);
			String is = button.click().getWebResponse().getContentAsString();

			f.createNewFile();
			BufferedWriter bw = new BufferedWriter((new OutputStreamWriter(new FileOutputStream(f.getAbsoluteFile()),"UTF-8")));
			try
			{
				bw.write(is);
			}
			finally
			{
				// Close the file handle even when the write fails
				bw.close();
			}
			System.err.println("Total articles:"+articleList.size()+" processed in category~~"+s);

		}
	}
	finally
	{
		// The client was previously never closed; release it on every exit path
		webClient.closeAllWindows();
	}

}
 
Example 39
Project: WikiKreator   File: WikiExportSupporter.java   View Source Code Vote up 4 votes
/**
 * Explores the sub-categories of one Wikipedia category and saves an XML export.
 *
 * <p>Steps, in order: load the Special:Export page for the category, collect the
 * "Category:" lines of its "pages" text area that do not contain "stubs" (the
 * prefix is kept), expand them with {@code searchCategoriesUntilDepth}, pass them
 * to {@code extractArticles}, then click the "Export" button of the first page and
 * write the response body to {@code filename.xml} in the working directory.
 *
 * @param categoryName category name, inserted verbatim into the "catname" query parameter
 * @throws FailingHttpStatusCodeException if a page request returns a failing status
 * @throws MalformedURLException if a request URL cannot be built
 * @throws IOException if a page cannot be loaded or the export file cannot be written
 */
public void getAllArticlesUnderCategory(String categoryName) throws FailingHttpStatusCodeException, MalformedURLException, IOException
{
	WebClient webClient = new WebClient(BrowserVersion.CHROME);
	try
	{
		HtmlPage page = webClient.getPage("https://en.wikipedia.org/w/index.php?title=Special%3AExport&" +
				"addcat=&catname="+categoryName+"&wpDownload=");
		System.out.println(page.getUrl());
		HtmlForm form = page.getForms().get(0);
		System.out.println(form.getAttribute("method"));
		System.out.println(page.getTitleText());
		HtmlSubmitInput button = (HtmlSubmitInput) form.getInputsByValue("Export").get(0);

		// Retrieve the sub-category lines from the export text area
		List<String> categoryList=new ArrayList<String>();
		String textArea=(page.getElementByName("pages").asText());
		String[] lines = textArea.split(System.getProperty("line.separator"));
		for(String line:lines)
		{
			if(line.startsWith("Category:") && !line.contains("stubs"))
			{
				categoryList.add(line);
			}
		}

		System.out.println(categoryList.size());
		categoryList=searchCategoriesUntilDepth(categoryList);
		System.out.println("Exploring "+categoryList.size()+" categories.......");
		extractArticles(categoryList);
		System.out.println(button.getAttribute("accesskey"));
		String is = button.click().getWebResponse().getContentAsString();
		File f=new File("filename.xml");
		f.createNewFile();
		// Write UTF-8 explicitly: FileWriter would use the platform charset and could
		// corrupt non-ASCII article text in the export
		BufferedWriter bw = new BufferedWriter(new java.io.OutputStreamWriter(
				new java.io.FileOutputStream(f.getAbsoluteFile()),"UTF-8"));
		try
		{
			bw.write(is);
		}
		finally
		{
			// Close the file handle even when the write fails
			bw.close();
		}
	}
	finally
	{
		// Release the browser windows even when a step above throws
		webClient.closeAllWindows();
	}
}
 
Example 40
Project: WikiKreator   File: WikiExportSupporter.java   View Source Code Vote up 4 votes
/**
 * Expands a list of "Category:..." entries by one level of sub-categories.
 *
 * <p>For every entry in {@code categoryList} the Special:Export page is loaded and
 * each "Category:" line of its "pages" text area is collected (prefix kept). Newly
 * found entries are appended to {@code categoryList} once each; the list is
 * modified in place and also returned. The {@code depth<=0} loop condition means
 * exactly one expansion pass is performed.
 *
 * @param categoryList entries of the form "Category:Name"; an entry without the
 *                     prefix is used as the category name unchanged; must be mutable
 * @return the same list instance, with the discovered sub-categories appended
 * @throws FailingHttpStatusCodeException if a page request returns a failing status
 * @throws MalformedURLException if a request URL cannot be built
 * @throws IOException if a page cannot be loaded
 */
public List<String> searchCategoriesUntilDepth(List<String> categoryList) throws FailingHttpStatusCodeException, MalformedURLException, IOException
{
	final String categoryPrefix="Category:";
	int depth=0;
	List<String> visitedCategories=new ArrayList<String>();
	while(depth<=0)
	{
		List<String> tempCatList=new ArrayList<String>();
		WebClient webClient = new WebClient(BrowserVersion.CHROME);
		try
		{
			for(String s:categoryList)
			{
				if(visitedCategories.contains(s))
					continue;
				System.err.println("Running category:"+s+"/"+categoryList.indexOf(s));
				visitedCategories.add(s);
				// Strip only the leading prefix: splitting on ':' would truncate names that
				// contain a colon and fail on entries that have none
				String categoryName=s.startsWith(categoryPrefix) ? s.substring(categoryPrefix.length()) : s;
				HtmlPage page = webClient.getPage("https://en.wikipedia.org/w/index.php?title=Special%3AExport&" +
						"addcat=&catname="+categoryName+"&wpDownload=");
				String textArea=(page.getElementByName("pages").asText());
				String[] lines = textArea.split(System.getProperty("line.separator"));
				for(String line:lines)
				{
					if(line.startsWith(categoryPrefix) && !categoryList.contains(line))
					{
						tempCatList.add(line);
					}
				}
			}
		}
		finally
		{
			// Release the browser windows even when a page load throws
			webClient.closeAllWindows();
		}

		// Add each new entry individually; adding the whole temp list at once would
		// carry over any duplicates it contains
		for(String l:tempCatList)
		{
			if(!categoryList.contains(l))
				categoryList.add(l);
		}
		depth++;
	}

	return categoryList;

}
 
Example 41
Project: WikiKreator   File: URLCrawlerForTopic.java   View Source Code Vote up 4 votes
/**
 * Runs a Google search through HtmlUnit and returns the result links.
 *
 * <p>The result page is serialised with {@code asXml()}, written as UTF-8 to
 * {@code searchResult.xml} in the working directory and parsed back as a DOM.
 * For every {@code <h3 class="r">} element, the {@code href} of its child node at
 * index 1 is taken (presumably the result anchor; verify against the current page
 * markup). Links that are blank or contain "google", "wikipedia" or "amazon" are
 * dropped.
 *
 * @param query search terms, appended verbatim to the "q" query parameter
 * @return the collected result URLs, in page order; possibly empty
 * @throws FailingHttpStatusCodeException if the search request returns a failing status
 * @throws MalformedURLException if the request URL cannot be built
 * @throws IOException if the page cannot be loaded, saved, or parsed
 * @throws InterruptedException if the thread is interrupted during the fixed wait
 */
public static List<String> searchGoogle(String query) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException
{
	// Silence HtmlUnit's logging
	java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
	System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");

	List<String> URLList=new ArrayList<String>();

	WebClient webClient = new WebClient(BrowserVersion.CHROME);
	try
	{
		webClient.getOptions().setThrowExceptionOnScriptError(false);
		String finalQuery="https://www.google.com/search?num="+numResults +
				"&q="+query;
		System.out.println(finalQuery);
		HtmlPage page = webClient.getPage(finalQuery);
		Thread.sleep(2000);
		String xmlString=page.asXml();
		File URLMapper=new File("searchResult.xml");
		URLMapper.createNewFile();
		BufferedWriter bw2=new BufferedWriter(new OutputStreamWriter
				(new FileOutputStream(URLMapper.getAbsolutePath()),"UTF-8"));
		try
		{
			bw2.write(xmlString);
		}
		finally
		{
			// Close the file handle even when the write fails
			bw2.close();
		}

		Document domDoc;
		try {
			// NOTE(review): the parsed content comes from the network; consider disabling
			// DTD loading / external entities on this factory.
			DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
			DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
			domDoc = docBuilder.parse(URLMapper);
		} catch (Exception e) {
			// Fail with the real cause instead of continuing with a null document
			throw new IOException("Could not parse search result page saved in "+URLMapper.getAbsolutePath(), e);
		}

		DocumentTraversal traversal = (DocumentTraversal) domDoc;
		NodeIterator iterator = traversal.createNodeIterator(domDoc.getDocumentElement(), NodeFilter.SHOW_ELEMENT, null, true);

		for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
			if(!n.getNodeName().contentEquals("h3"))
				continue;
			String j=((Element)n).getAttribute("class");
			if(!j.contentEquals("r"))
				continue;
			Node a=n.getChildNodes().item(1);
			// item(1) is null or a non-element node when the heading has a different shape
			if(!(a instanceof Element))
				continue;
			String url=((Element) a).getAttribute("href");
			if(!(url.contains("google")||url.contains("wikipedia")||url.contains("amazon"))
					&& url.trim().length()>0)
			{
				URLList.add(url);
			}
		}
		System.out.println(URLList);
		return URLList;
	}
	finally
	{
		// Release the browser windows even when a step above throws
		webClient.closeAllWindows();
	}

}
 
Example 42
Project: spring4-sandbox   File: WebDriverCreateTaskITests.java   View Source Code Vote up 4 votes
/**
 * Gives every test a fresh HtmlUnit driver configured as {@code BrowserVersion.CHROME},
 * stored in the {@code driver} field.
 *
 * @throws Exception declared by the signature; nothing visible here throws a checked exception
 */
@Before
public void setUp() throws Exception {
	final HtmlUnitDriver chromeDriver = new HtmlUnitDriver(BrowserVersion.CHROME);
	driver = chromeDriver;
}