package packt.crawlerj4mavenexample;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

/**
 * Command-line entry point that wires up the crawler4j components and starts
 * a small crawl from a single seed URL, using {@code SampleCrawler} as the
 * crawler class.
 */
public class CrawlerController {

    /**
     * Configures crawler4j, registers the seed page and starts the crawl.
     *
     * @param args command-line arguments; not read by this method
     * @throws Exception propagated unchanged from the crawler4j setup calls
     */
    public static void main(String[] args) throws Exception {
        final int crawlerCount = 2;

        // The same configuration object is shared by the fetcher and the
        // controller, so it is fully populated before either one is built.
        final CrawlConfig crawlConfig = buildCrawlConfig();
        final PageFetcher fetcher = new PageFetcher(crawlConfig);

        // robots.txt handling uses a default RobotstxtConfig (no overrides applied here).
        final RobotstxtServer robotsServer =
                new RobotstxtServer(new RobotstxtConfig(), fetcher);

        final CrawlController crawlController =
                new CrawlController(crawlConfig, fetcher, robotsServer);
        crawlController.addSeed("https://en.wikipedia.org/wiki/Bishop_Rock,_Isles_of_Scilly");
        crawlController.start(SampleCrawler.class, crawlerCount);
    }

    /**
     * Creates the crawl configuration used by {@link #main(String[])}.
     *
     * <p>Settings applied: storage folder {@code "data"}, politeness delay
     * {@code 500} (NOTE(review): crawler4j documents this value as
     * milliseconds — confirm against the library version in use), maximum
     * crawl depth {@code 2}, at most {@code 20} pages fetched, and binary
     * content excluded from crawling.
     *
     * @return a newly created, populated {@link CrawlConfig}
     */
    private static CrawlConfig buildCrawlConfig() {
        final CrawlConfig crawlConfig = new CrawlConfig();
        crawlConfig.setCrawlStorageFolder("data");
        crawlConfig.setPolitenessDelay(500);
        crawlConfig.setMaxDepthOfCrawling(2);
        crawlConfig.setMaxPagesToFetch(20);
        crawlConfig.setIncludeBinaryContentInCrawling(false);
        return crawlConfig;
    }
}