edu.uci.ics.crawler4j.crawler.CrawlController Java Examples

The following examples show how to use edu.uci.ics.crawler4j.crawler.CrawlController. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CrawlControllerFactory.java    From vividus with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@link CrawlController} for the given main application page, with
 * robots.txt handling disabled.
 *
 * @param mainApplicationPage URI passed to {@code createCrawlConfig} to build the crawl configuration
 * @return the newly constructed controller
 * @throws IllegalArgumentException wrapping any exception thrown by the controller constructor
 */
@Override
public CrawlController createCrawlController(URI mainApplicationPage)
{
    CrawlConfig config = createCrawlConfig(mainApplicationPage);

    RobotstxtConfig robotsConfig = new RobotstxtConfig();
    robotsConfig.setEnabled(false);

    PageFetcher fetcher = new PageFetcher(config);
    RobotstxtServer robotsServer = new RobotstxtServer(robotsConfig, fetcher);

    CrawlController controller;
    try
    {
        controller = new CrawlController(config, fetcher, robotsServer);
    }
    catch (Exception e)
    {
        // Callers get an unchecked exception; the original failure is kept as the cause.
        throw new IllegalArgumentException(e);
    }
    return controller;
}
 
Example #2
Source File: HeadlessCrawlerTableTransformerTests.java    From vividus with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches URLs twice with the same table properties and checks that the second
 * call returns the very same set without touching the crawl controller or its
 * factory again.
 */
@Test
void testFetchUrlsTwiceWithSameProperties()
{
    String seedUrl = "/seed1";
    String appPage = buildAppPageUrl(DEFAULT_RELATIVE_URL);
    CrawlController controller = mockCrawlerControllerFactory(appPage);
    InOrder callOrder = inOrder(crawlControllerFactory, controller);
    TableProperties properties = buildTableProperties();
    transformer.setSeedRelativeUrls(toSet(seedUrl));

    // First fetch: goes through the (mocked) crawl controller.
    Set<String> firstFetch = runUrlFetching(appPage, properties, List.of(seedUrl), controller,
            callOrder);
    assertThat(firstFetch, equalTo(Set.of(OUTGOING_ABSOLUT_URL)));

    // Second fetch with identical properties: no further interactions are allowed.
    Set<String> secondFetch = transformer.fetchUrls(properties);
    verifyNoMoreInteractions(crawlControllerFactory, controller);
    assertThat(secondFetch, equalTo(Set.of(OUTGOING_ABSOLUT_URL)));

    // Same instance, not merely an equal set.
    assertSame(firstFetch, secondFetch);
    verifyNoInteractions(redirectsProvider);
}
 
Example #3
Source File: PornBot.java    From WebVideoBot with MIT License 6 votes vote down vote up
/**
 * Opens the browser, then starts non-blocking crawls one start URL at a time,
 * keeping at most {@code Properties.CONCURRENT_THREAD_SIZE} controllers in the
 * running set, until {@code Properties.MAX_PAGE_SIZE} crawls have been started.
 *
 * <p>The crawl counter is advanced for every started crawl. Without that the
 * loop condition can never become false and crawls would be started forever.
 *
 * @param args command-line arguments; not read by this method
 * @throws Exception propagated from {@code openBrowser()}, controller creation or crawl start
 */
@Override
public void run(String... args) throws Exception {
    openBrowser();

    PornCrawlControllerFactory crawlControllerFactory = new PornCrawlControllerFactory();
    // Number of the next crawl to start (1-based).
    int i = 1;
    while (i <= Properties.MAX_PAGE_SIZE) {
        // Top the running set up to CONCURRENT_THREAD_SIZE.
        while (runnigCrawl.size() < Properties.CONCURRENT_THREAD_SIZE && i <= Properties.MAX_PAGE_SIZE) {
            String startUrl = Properties.getNextUrl();
            CrawlController controller = crawlControllerFactory.getController();
            controller.addSeed(startUrl);
            controller.startNonBlocking(pornCrawlerFactory.getObject(), 2);
            runnigCrawl.add(controller);
            logger.info("CrawlController start search url:[{}].", startUrl);
            i++;
        }

        // Free slots held by finished crawls.
        // NOTE(review): while the running set is full and nothing has finished, this loop
        // spins without pausing — consider a short sleep between passes.
        runnigCrawl.removeIf(CrawlController::isFinished);
    }
}
 
Example #4
Source File: CrawlerController.java    From Java-for-Data-Science with MIT License 6 votes vote down vote up
/**
 * Runs a blocking crawl of a single Wikipedia seed page with two
 * {@code SampleCrawler} instances, limited to depth 2 and 20 fetched pages.
 *
 * @param args command-line arguments; not used
 * @throws Exception if the controller cannot be created or the crawl fails
 */
public static void main(String[] args) throws Exception {
  CrawlConfig crawlConfig = new CrawlConfig();
  crawlConfig.setCrawlStorageFolder("data");
  crawlConfig.setPolitenessDelay(500);
  crawlConfig.setMaxDepthOfCrawling(2);
  crawlConfig.setMaxPagesToFetch(20);
  crawlConfig.setIncludeBinaryContentInCrawling(false);

  PageFetcher fetcher = new PageFetcher(crawlConfig);
  RobotstxtServer robotsServer = new RobotstxtServer(new RobotstxtConfig(), fetcher);
  CrawlController crawlController = new CrawlController(crawlConfig, fetcher, robotsServer);

  crawlController.addSeed("https://en.wikipedia.org/wiki/Bishop_Rock,_Isles_of_Scilly");

  final int crawlerCount = 2;
  crawlController.start(SampleCrawler.class, crawlerCount);
}
 
Example #5
Source File: HtmlCrawlerController.java    From tutorials with MIT License 6 votes vote down vote up
/**
 * Crawls https://www.baeldung.com/ to depth 2 with twelve {@code HtmlCrawler}
 * instances sharing one {@code CrawlerStatistics} object, then prints the
 * collected page and link counts.
 *
 * @param args command-line arguments; not used
 * @throws Exception if the controller cannot be created or the crawl fails
 */
public static void main(String[] args) throws Exception {
    String storagePath = new File("src/test/resources/crawler4j").getAbsolutePath();

    CrawlConfig crawlConfig = new CrawlConfig();
    crawlConfig.setCrawlStorageFolder(storagePath);
    crawlConfig.setMaxDepthOfCrawling(2);

    PageFetcher fetcher = new PageFetcher(crawlConfig);
    RobotstxtServer robotsServer = new RobotstxtServer(new RobotstxtConfig(), fetcher);
    CrawlController crawlController = new CrawlController(crawlConfig, fetcher, robotsServer);
    crawlController.addSeed("https://www.baeldung.com/");

    // Every crawler instance reports into the same statistics object.
    CrawlerStatistics statistics = new CrawlerStatistics();
    CrawlController.WebCrawlerFactory<HtmlCrawler> crawlerFactory = () -> new HtmlCrawler(statistics);

    final int crawlerCount = 12;
    // start() blocks until the crawl is over, so the statistics are complete below.
    crawlController.start(crawlerFactory, crawlerCount);

    System.out.printf("Crawled %d pages %n", statistics.getProcessedPageCount());
    System.out.printf("Total Number of outbound links = %d %n", statistics.getTotalLinksCount());
}
 
Example #6
Source File: ImageCrawlerController.java    From tutorials with MIT License 6 votes vote down vote up
/**
 * Crawls https://www.baeldung.com/ including binary content, fetching at most
 * 500 pages, with twelve {@code ImageCrawler} instances that are all given the
 * same save directory.
 *
 * @param args command-line arguments; not used
 * @throws Exception if the controller cannot be created or the crawl fails
 */
public static void main(String[] args) throws Exception {
    // The crawl storage folder and the image save directory point at the same path.
    String crawlerDir = "src/test/resources/crawler4j";

    CrawlConfig crawlConfig = new CrawlConfig();
    crawlConfig.setCrawlStorageFolder(new File(crawlerDir).getAbsolutePath());
    crawlConfig.setIncludeBinaryContentInCrawling(true);
    crawlConfig.setMaxPagesToFetch(500);

    PageFetcher fetcher = new PageFetcher(crawlConfig);
    RobotstxtServer robotsServer = new RobotstxtServer(new RobotstxtConfig(), fetcher);
    CrawlController crawlController = new CrawlController(crawlConfig, fetcher, robotsServer);
    crawlController.addSeed("https://www.baeldung.com/");

    File imageDir = new File(crawlerDir);
    CrawlController.WebCrawlerFactory<ImageCrawler> crawlerFactory = () -> new ImageCrawler(imageDir);

    final int crawlerCount = 12;
    crawlController.start(crawlerFactory, crawlerCount);
}
 
Example #7
Source File: HeadlessCrawlerTableTransformer.java    From vividus with Apache License 2.0 5 votes vote down vote up
/**
 * Registers the main application page as a crawl seed and, when
 * {@code seedRelativeUrls} is set, one more seed per relative URL resolved
 * against the main application page's path.
 *
 * @param mainApplicationPage page whose string form is always added as the first seed
 * @param controller          controller receiving the seeds
 */
private void addSeeds(URI mainApplicationPage, CrawlController controller)
{
    controller.addSeed(mainApplicationPage.toString());
    if (this.seedRelativeUrls != null)
    {
        // Base path always ends with a slash; each seed has its leading slash stripped
        // before being appended, so the two never produce a double slash at the join.
        String basePath = StringUtils.appendIfMissing(mainApplicationPage.getPath(), FORWARD_SLASH);
        this.seedRelativeUrls.stream()
                .map(seed -> StringUtils.removeStart(seed, FORWARD_SLASH))
                .map(basePath::concat)
                .map(seedPath -> UriUtils.buildNewUrl(mainApplicationPage, seedPath).toString())
                .forEach(controller::addSeed);
    }
}
 
Example #8
Source File: HeadlessCrawlerTableTransformerTests.java    From vividus with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the main application page URL from the given relative URL, mocks the
 * crawl controller factory for it and runs the URL fetching with freshly built
 * table properties.
 *
 * @param mainAppPageRelativeUrl   relative URL used to build the main application page URL
 * @param expectedSeedRelativeUrls relative seed URLs forwarded to {@code runUrlFetching}
 * @return the URLs returned by {@code runUrlFetching}
 */
private Set<String> testFetchUrls(String mainAppPageRelativeUrl, List<String> expectedSeedRelativeUrls)
{
    String appPageUrl = buildAppPageUrl(mainAppPageRelativeUrl);
    CrawlController controller = mockCrawlerControllerFactory(appPageUrl);
    return runUrlFetching(appPageUrl, buildTableProperties(), expectedSeedRelativeUrls, controller);
}
 
Example #9
Source File: HeadlessCrawlerTableTransformerTests.java    From vividus with Apache License 2.0 5 votes vote down vote up
/**
 * Convenience overload that creates a new {@code InOrder} verifier over the
 * crawl controller factory and the given controller, then delegates to the
 * five-argument {@code runUrlFetching}.
 *
 * @param mainAppPage              main application page URL
 * @param tableProperties          table properties passed to the transformer
 * @param expectedSeedRelativeUrls relative seed URLs expected to be added
 * @param crawlController          mocked crawl controller
 * @return the URLs returned by the delegate
 */
private Set<String> runUrlFetching(String mainAppPage, TableProperties tableProperties,
                                   List<String> expectedSeedRelativeUrls, CrawlController crawlController)
{
    return runUrlFetching(mainAppPage, tableProperties, expectedSeedRelativeUrls, crawlController,
            inOrder(crawlControllerFactory, crawlController));
}
 
Example #10
Source File: HeadlessCrawlerTableTransformerTests.java    From vividus with Apache License 2.0 5 votes vote down vote up
/**
 * Stubs the mocked controller's {@code start} call, runs {@code transformer.fetchUrls}
 * and verifies, in order, that the controller was created, seeded and started.
 *
 * @param mainAppPage              main application page URL
 * @param tableProperties          table properties passed to {@code transformer.fetchUrls}
 * @param expectedSeedRelativeUrls relative seed URLs expected to be added after the main page
 * @param crawlController          mocked crawl controller
 * @param ordered                  in-order verifier used for the factory and controller checks
 * @return the URLs returned by {@code transformer.fetchUrls}
 */
private Set<String> runUrlFetching(String mainAppPage, TableProperties tableProperties,
                                   List<String> expectedSeedRelativeUrls, CrawlController crawlController,
                                   InOrder ordered)
{
    URI mainAppPageUri = URI.create(mainAppPage);
    // The argument matcher doubles as a crawl simulation: whenever it is evaluated against a
    // LinkCrawlerFactory, it creates a crawler and feeds it one page with one outgoing URL.
    doNothing().when(crawlController).start((WebCrawlerFactory<?>) argThat(factory ->
    {
        if (factory instanceof LinkCrawlerFactory)
        {
            LinkCrawler linkCrawler = ((LinkCrawlerFactory) factory).newInstance();
            HtmlParseData htmlParseData = new HtmlParseData();
            String outgoingUrl = UriUtils.buildNewUrl(mainAppPage, OUTGOING_ABSOLUT_URL).toString();
            htmlParseData.setOutgoingUrls(Set.of(createWebUrl(outgoingUrl)));
            String crawlingPageUrl = UriUtils.buildNewUrl(mainAppPage, CRAWLING_RELATIVE_URL).toString();
            WebURL crawlingPageWebUrl = createWebUrl(crawlingPageUrl);
            Page page = new Page(crawlingPageWebUrl);
            page.setParseData(htmlParseData);
            // Visit the page only if the crawler itself accepts it.
            if (linkCrawler.shouldVisit(page, crawlingPageWebUrl))
            {
                linkCrawler.visit(page);
            }
            return true;
        }
        return false;
    }), eq(50));
    Set<String> urls = transformer.fetchUrls(tableProperties);
    ordered.verify(crawlControllerFactory).createCrawlController(mainAppPageUri);
    // Seeds must be added in order: the main page first, then each expected relative seed.
    Stream.concat(Stream.of(mainAppPage),
            expectedSeedRelativeUrls.stream().map(HeadlessCrawlerTableTransformerTests::buildAppPageUrl))
            .forEach(url -> ordered.verify(crawlController).addSeed(url));
    ordered.verify(crawlController).start(any(LinkCrawlerFactory.class), eq(50));
    verifyNoMoreInteractions(crawlController);
    return urls;
}
 
Example #11
Source File: HeadlessCrawlerTableTransformerTests.java    From vividus with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a mocked {@link CrawlController} and wires the web application
 * configuration and the crawl controller factory mocks so that the given main
 * application page resolves to it.
 *
 * @param mainAppPage main application page URL
 * @return the mocked controller that the factory will hand out for that page
 */
private CrawlController mockCrawlerControllerFactory(String mainAppPage)
{
    URI appPageUri = URI.create(mainAppPage);
    CrawlController controllerMock = mock(CrawlController.class);
    when(webApplicationConfiguration.getMainApplicationPageUrl()).thenReturn(appPageUri);
    when(crawlControllerFactory.createCrawlController(appPageUri)).thenReturn(controllerMock);
    return controllerMock;
}
 
Example #12
Source File: PornCrawlControllerFactory.java    From WebVideoBot with MIT License 5 votes vote down vote up
/**
 * Creates a new {@link CrawlController} from a freshly prepared configuration,
 * a new page fetcher and the factory's {@code robotstxtConfig}.
 *
 * @return the new controller
 * @throws Exception if preparing the configuration or constructing the controller fails
 */
public CrawlController getController() throws Exception {
    CrawlConfig crawlConfig = prepareConfig();
    PageFetcher fetcher = new PageFetcher(crawlConfig);
    return new CrawlController(crawlConfig, fetcher, new RobotstxtServer(robotstxtConfig, fetcher));
}
 
Example #13
Source File: VsController.java    From visual-spider with MIT License 5 votes vote down vote up
/**
 * Initializes the crawl controller and registers the seed links.
 *
 * <p>Failures while creating the controller or adding seeds are logged, not
 * rethrown; in that case {@code isInited} is not set by this call.
 *
 * @param numberOfCrawlers number of crawler threads
 * @param maxDepthOfCrawling maximum crawl depth
 * @param maxPagesToFetch maximum number of pages to fetch
 * @param politenessDelay politeness delay passed to the crawl configuration
 * @param links links to crawl; only entries accepted by {@code Checker.isHyperLink} become seeds
 */
public void init(int numberOfCrawlers, int maxDepthOfCrawling, int maxPagesToFetch, int politenessDelay,
        String[] links) {
    this.numberOfCrawlers = numberOfCrawlers;
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(DefaultConfigValues.CRAWL_STORAGE_FOLDER);
    config.setMaxDepthOfCrawling(maxDepthOfCrawling);
    config.setIncludeHttpsPages(true);
    config.setMaxPagesToFetch(maxPagesToFetch);
    config.setIncludeBinaryContentInCrawling(false);
    config.setPolitenessDelay(politenessDelay);
    config.setUserAgentString(DefaultConfigValues.USER_AGENT);
    config.setResumableCrawling(true);

    // Proxy settings come from the application's own CrawlConfig model (fully qualified
    // because its simple name clashes with crawler4j's CrawlConfig).
    if (com.zhazhapan.vspider.models.CrawlConfig.getTurnOnProxy().get()) {
        LOGGER.info("open proxy");
        config.setProxyHost(com.zhazhapan.vspider.models.CrawlConfig.getProxyServer().get());
        config.setProxyPort(Formatter.stringToInt(com.zhazhapan.vspider.models.CrawlConfig.getProxyPort().get()));
        config.setProxyUsername(com.zhazhapan.vspider.models.CrawlConfig.getProxyUser().get());
        config.setProxyPassword(com.zhazhapan.vspider.models.CrawlConfig.getProxyPass().get());
    }

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    robotstxtConfig.setEnabled(false);
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    try {
        controller = new CrawlController(config, pageFetcher, robotstxtServer);
        for (String link : links) {
            if (Checker.isHyperLink(link)) {
                controller.addSeed(link);
            }
        }
        isInited = true;
    } catch (Exception e) {
        // Pass the exception itself so its type and stack trace reach the log, not just the message.
        LOGGER.error("start to crawl urls error: " + e.getMessage(), e);
    }
}
 
Example #14
Source File: Crawler.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Finishes configuring the given crawl configuration, creates the crawl
 * controller, registers every seed URL and prepares the crawler core and its
 * factory.
 *
 * <p>Failures are logged, not rethrown.
 *
 * @param config   crawl configuration; modified by this method
 * @param storing  location handed to the {@code CrawlerCore} constructor
 * @param urlSeeds seed URLs added to the controller and handed to the {@code CrawlerCore}
 */
private void createCrawler(CrawlConfig config, File storing, List<String> urlSeeds)
{
    config.setIncludeBinaryContentInCrawling(true);
    config.setResumableCrawling(false);
    // 6,250,000 bytes is about 6.25 MB (50 megabits), not 50 MB.
    config.setMaxDownloadSize(6250000);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

    try
    {
        controller = new CrawlController(config, pageFetcher, robotstxtServer);

        for (String seed : urlSeeds)
        {
            controller.addSeed(seed);
        }

        core = new CrawlerCore(storing, urlSeeds);

        // NOTE(review): the factory returns the same CrawlerCore instance on every call —
        // confirm this sharing is intended if more than one crawler thread is started.
        factory = () -> core;

        core.getMappingPaths();
    }
    catch (Exception e)
    {
        // Pass the exception itself so the stack trace reaches the log, not only its string form.
        logger.error("Error in the creation of a crawler:" + e, e);
    }
}
 
Example #15
Source File: MultipleCrawlerController.java    From tutorials with MIT License 4 votes vote down vote up
/**
 * Runs an HTML crawl and an image crawl of https://www.baeldung.com/ side by
 * side, each with its own configuration, page fetcher and controller, and
 * reports once each of them has finished.
 *
 * @param args command-line arguments; not used
 * @throws Exception if a controller cannot be created or a crawl fails
 */
public static void main(String[] args) throws Exception {
    File storageRoot = new File("src/test/resources/crawler4j");

    // HTML crawl: own storage sub-folder, up to 500 pages.
    CrawlConfig htmlCrawlConfig = new CrawlConfig();
    htmlCrawlConfig.setCrawlStorageFolder(new File(storageRoot, "html").getAbsolutePath());
    htmlCrawlConfig.setMaxPagesToFetch(500);

    // Image crawl: own storage sub-folder, binary content included, up to 1000 pages.
    CrawlConfig imageCrawlConfig = new CrawlConfig();
    imageCrawlConfig.setCrawlStorageFolder(new File(storageRoot, "image").getAbsolutePath());
    imageCrawlConfig.setIncludeBinaryContentInCrawling(true);
    imageCrawlConfig.setMaxPagesToFetch(1000);

    PageFetcher htmlFetcher = new PageFetcher(htmlCrawlConfig);
    PageFetcher imageFetcher = new PageFetcher(imageCrawlConfig);

    // Both controllers share one robots.txt server, which is backed by the HTML fetcher.
    RobotstxtServer sharedRobotsServer = new RobotstxtServer(new RobotstxtConfig(), htmlFetcher);

    CrawlController htmlCrawl = new CrawlController(htmlCrawlConfig, htmlFetcher, sharedRobotsServer);
    CrawlController imageCrawl = new CrawlController(imageCrawlConfig, imageFetcher, sharedRobotsServer);

    String seed = "https://www.baeldung.com/";
    htmlCrawl.addSeed(seed);
    imageCrawl.addSeed(seed);

    CrawlerStatistics statistics = new CrawlerStatistics();
    CrawlController.WebCrawlerFactory<HtmlCrawler> htmlCrawlers = () -> new HtmlCrawler(statistics);

    File imageDir = new File("src/test/resources/crawler4j");
    CrawlController.WebCrawlerFactory<ImageCrawler> imageCrawlers = () -> new ImageCrawler(imageDir);

    // Start both crawls without blocking, image crawl first.
    imageCrawl.startNonBlocking(imageCrawlers, 7);
    htmlCrawl.startNonBlocking(htmlCrawlers, 10);

    htmlCrawl.waitUntilFinish();
    System.out.printf("Crawled %d pages %n", statistics.getProcessedPageCount());
    System.out.printf("Total Number of outbound links = %d %n", statistics.getTotalLinksCount());

    imageCrawl.waitUntilFinish();
    System.out.printf("Image Crawler is finished.");
}
 
Example #16
Source File: ICrawlControllerFactory.java    From vividus with Apache License 2.0 votes vote down vote up
/**
 * Creates a {@link CrawlController} for the given main application page.
 * NOTE(review): contract inferred from the signature only — confirm details
 * (configuration applied, failure behavior) against the implementing classes.
 *
 * @param mainApplicationPage URI of the main application page
 * @return the crawl controller
 */
CrawlController createCrawlController(URI mainApplicationPage);