/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package navex;

import java.net.MalformedURLException;
import java.util.ArrayList;

import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.message.BasicNameValuePair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.authentication.FormAuthInfo;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.util.IO;

import navex.graphDatabase.NeoGraphDatabase;

/**
 * @author Yasser Ganjisaffar
 * @modified by: Abeer Alhuzali
 */
public class BasicCrawlController {

    private static final Logger logger = LoggerFactory.getLogger(BasicCrawlController.class);

    public static CrawlConfig crawlerMain(String[] args, NeoGraphDatabase graph) throws Exception {
        if (args.length < 4) {
            logger.info("Needed parameters: ");
            logger.info("\t rootFolder (it will contain intermediate crawl data)");
            logger.info("\t numberOfCrawlers (number of concurrent threads)");
            logger.info("\t authFile (file that stores the form-authentication information)");
            logger.info("\t seed (the URL the crawl starts from)");
            return null;
        }

        // The form-authentication information is stored in a file.
        String authFile = args[2];
        ArrayList<String[]> authList = IO.readAuthFile(authFile);

        String seed = args[3];

        /*
         * crawlStorageFolder is a folder where intermediate crawl data is stored.
         */
        String crawlStorageFolder = args[0];

        /*
         * numberOfCrawlers is the number of concurrent threads that should be
         * initiated for crawling.
         */
        int numberOfCrawlers = Integer.parseInt(args[1]);

        // Run one crawl per authentication entry.
        CrawlConfig config = null;
        for (int i = 0; i < authList.size(); i++) {
            config = startCrawlling(crawlStorageFolder, numberOfCrawlers, authList.get(i), seed, graph);
        }
        return config;
    }
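    /**
     * Configures crawler4j for a single authentication entry (a form login plus
     * any extra form-field name/value pairs; may be null for an unauthenticated
     * crawl), runs the crawl from the given seed, and then triggers the
     * navigation-graph build.
     */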
    public static CrawlConfig startCrawlling(String crawlStorageFolder, int numberOfCrawlers,
            String[] authList, String seed, NeoGraphDatabase graph) {

        CrawlConfig config = new CrawlConfig();

        config.setCrawlStorageFolder(crawlStorageFolder);

        /*
         * Be polite: make sure that we don't send more than 1 request per
         * second (1000 milliseconds between requests).
         */
        // config.setPolitenessDelay(1000);

        /*
         * You can set the maximum crawl depth here. The default value is -1
         * for unlimited depth.
         */
        config.setMaxDepthOfCrawling(20);

        /*
         * You can set the maximum number of pages to crawl. The default value
         * is -1 for an unlimited number of pages.
         */
        config.setMaxPagesToFetch(1000);

        /*
         * Do you want crawler4j to also crawl binary data?
         * For example: the contents of PDFs, or the metadata of images, etc.
         */
        config.setIncludeBinaryContentInCrawling(false);

        /*
         * Do you need to set a proxy? If so, you can use:
         * config.setProxyHost("proxyserver.example.com");
         * config.setProxyPort(8080);
         *
         * If your proxy also needs authentication:
         * config.setProxyUsername(username); config.setProxyPassword(password);
         */

        /*
         * This config parameter can be used to make your crawl resumable
         * (meaning that you can resume the crawl from a previously
         * interrupted/crashed crawl). Note: if you enable resuming and want to
         * start a fresh crawl, you need to delete the contents of rootFolder
         * manually.
         */
        config.setResumableCrawling(false);

        if (authList != null) {
            FormAuthInfo auth;
            ArrayList<NameValuePair> nvl = null;
            try {
                // Entries beyond the first five are consumed in pairs as extra
                // form-field name/value parameters.
                if (authList.length > 5) {
                    for (int i = authList.length; i > 5; i = i - 2) {
                        if (nvl == null) {
                            nvl = new ArrayList<NameValuePair>();
                        }
                        NameValuePair nv = new BasicNameValuePair(authList[i - 2], authList[i - 1]);
                        nvl.add(nv);
                    }
                }
                auth = new FormAuthInfo(authList[0], authList[1], authList[2], authList[3], authList[4], nvl);
                CrawlConfig.setRole(authList[0] + "," + authList[1]);

                HttpClientContext context = HttpClientContext.create();
                BasicCookieStore cookieStore = new BasicCookieStore();
                context.setCookieStore(cookieStore);

                config.addAuthInfo(auth);
            } catch (MalformedURLException e) {
                e.printStackTrace();
            }
        }

        /*
         * Instantiate the controller for this crawl.
         */
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

        CrawlController controller = null;
        try {
            controller = new CrawlController(config, pageFetcher, robotstxtServer);
        } catch (Exception e1) {
            e1.printStackTrace();
        }

        /*
         * For each crawl, you need to add some seed URLs. These are the first
         * URLs that are fetched; the crawler then starts following links found
         * in those pages.
         */
        controller.addSeed(seed);

        /*
         * Start the crawl. This is a blocking operation, meaning that your code
         * will only reach the line after this one when crawling is finished.
         */
        controller.start(CrawlerFilter.class, numberOfCrawlers);
        logger.info("Crawler 1 is finished.");

        /*
         * After crawling the whole application and building the graph database
         * from HTTP links only, we now process the forms (stage 2).
         */
        startCreatingDB(graph);

        return config;
    }
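    /**
     * Looks up a cookie by name in the given cookie store and returns its value
     * (the last match wins), or null if no such cookie exists. Example
     * (illustrative only; the cookie name is an assumption, not taken from this code):
     *
     * <pre>
     * String sessionId = getCookieValue(cookieStore, "PHPSESSID");
     * </pre>
     */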
    public static String getCookieValue(CookieStore cookieStore, String cookieName) {
        String value = null;
        for (Cookie cookie : cookieStore.getCookies()) {
            if (cookie.getName().equals(cookieName)) {
                value = cookie.getValue();
            }
        }
        return value;
    }

    private static void startCreatingDB(NeoGraphDatabase graph) {
        logger.info("Start building the navigation graph ...\n");

        // Every crawled page is added to the graph as a "link" node.
        for (Page page : CrawlerFilter.getAllPages()) {
            graph.StartNeoDb(page, null, "link", null, null);
        }
        // Then add the link relationships to the graph.
        graph.addLinksStart();
    }
}
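/*
 * Illustrative usage sketch: it only shows the argument order that crawlerMain
 * expects, as read from the parsing code above. The folder, thread count,
 * auth-file path, and seed URL are hypothetical placeholders, and a real
 * NeoGraphDatabase instance must be supplied (its construction is
 * project-specific and is not shown here).
 */
class BasicCrawlControllerUsageExample {
    public static void main(String[] args) throws Exception {
        String[] crawlerArgs = {
                "/tmp/navex-crawl",      // args[0]: crawlStorageFolder (intermediate crawl data)
                "4",                     // args[1]: numberOfCrawlers (concurrent threads)
                "/tmp/auth.txt",         // args[2]: authFile, read by IO.readAuthFile
                "http://localhost/app/"  // args[3]: seed URL the crawl starts from
        };

        NeoGraphDatabase graph = null; // supply a real NAVEX graph instance here
        if (graph == null) {
            System.err.println("Provide a NeoGraphDatabase instance before running this sketch.");
            return;
        }

        BasicCrawlController.crawlerMain(crawlerArgs, graph);
    }
}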