Java Code Examples for org.apache.flink.api.common.io.statistics.BaseStatistics

The following examples show how to use org.apache.flink.api.common.io.statistics.BaseStatistics. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: flink   Source File: EnumerateNestedFilesTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Places a single file two directory levels deep, enables recursive enumeration and
 * checks that the statistics report exactly that file's size.
 */
@Test
public void testGetStatisticsOneFileInNestedDir() {
	try {
		final long expectedBytes = 1024 * 500;

		// build <temp>/<first>/<second>; the format is later pointed at <first>
		final File innerDir = tempFolder.newFolder(TestFileUtils.randomFileName(), TestFileUtils.randomFileName());
		final File outerDir = innerDir.getParentFile();

		// the only data file lives in the innermost directory
		TestFileUtils.createTempFileInDirectory(innerDir.getAbsolutePath(), expectedBytes);

		final Path inputPath = new Path(outerDir.toURI().toString());
		this.format.setFilePath(inputPath);
		this.config.setBoolean("recursive.file.enumeration", true);
		this.format.configure(this.config);

		// null: no cached statistics are handed in
		final BaseStatistics statistics = this.format.getStatistics(null);
		Assert.assertEquals("The file size from the statistics is wrong.", expectedBytes, statistics.getTotalInputSize());
	} catch (Exception failure) {
		failure.printStackTrace();
		Assert.fail(failure.getMessage());
	}
}
 
Example 2
Source Project: Flink-CEPplus   Source File: FileInputFormatTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Requests statistics for a single temp file without passing cached statistics and
 * checks the reported total input size against the size the file was created with.
 */
@Test
public void testGetStatisticsOneFileNoCachedVersion() {
	try {
		final long expectedBytes = 1024 * 500;
		final String inputFile = TestFileUtils.createTempFile(expectedBytes);

		final DummyFileInputFormat inputFormat = new DummyFileInputFormat();
		inputFormat.setFilePath(inputFile);
		inputFormat.configure(new Configuration());

		// null: no cached statistics are handed in
		final long reportedBytes = inputFormat.getStatistics(null).getTotalInputSize();
		Assert.assertEquals("The file size from the statistics is wrong.", expectedBytes, reportedBytes);
	} catch (Exception failure) {
		failure.printStackTrace();
		Assert.fail(failure.getMessage());
	}
}
 
Example 3
Source Project: Flink-CEPplus   Source File: FileInputFormatTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a directory holding three files of different sizes and checks that the
 * statistics for that directory report the sum of the three sizes.
 */
@Test
public void testGetStatisticsMultipleFilesNoCachedVersion() {
	try {
		final long firstBytes = 2077;
		final long secondBytes = 31909;
		final long thirdBytes = 10;

		final String inputDir = TestFileUtils.createTempFileDir(temporaryFolder.newFolder(), firstBytes, secondBytes, thirdBytes);

		final DummyFileInputFormat inputFormat = new DummyFileInputFormat();
		inputFormat.setFilePath(inputDir);
		inputFormat.configure(new Configuration());

		// null: no cached statistics are handed in
		final long expectedBytes = firstBytes + secondBytes + thirdBytes;
		final BaseStatistics statistics = inputFormat.getStatistics(null);
		Assert.assertEquals("The file size from the statistics is wrong.", expectedBytes, statistics.getTotalInputSize());
	} catch (Exception failure) {
		failure.printStackTrace();
		Assert.fail(failure.getMessage());
	}
}
 
Example 4
Source Project: flink   Source File: FileInputFormat.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Aggregates the per-path statistics of all given paths into one statistics object.
 *
 * <p>The total size is the sum of the per-path sizes; as soon as one path reports
 * {@code BaseStatistics.SIZE_UNKNOWN}, the total is (and stays) unknown. The modification
 * time is the maximum over all paths.
 *
 * @param cachedStats previously gathered statistics, or {@code null} if there are none
 * @param filePaths the paths to gather statistics for
 * @param files list handed through to the per-path overload
 * @return {@code cachedStats} if it is non-null and at least as recent as the newest
 *         modification time found, otherwise a newly created statistics object
 * @throws IOException declared for failures in the file system lookup or the per-path overload
 */
protected FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, Path[] filePaths, ArrayList<FileStatus> files) throws IOException {

	long newestModification = 0;
	long accumulatedSize = 0;

	for (Path current : filePaths) {
		final FileSystem fileSystem = FileSystem.get(current.toUri());
		final FileBaseStatistics perPath = getFileStats(cachedStats, current, fileSystem, files);
		final long pathSize = perPath.getTotalInputSize();

		if (pathSize == BaseStatistics.SIZE_UNKNOWN) {
			// one unknown size makes the whole total unknown
			accumulatedSize = BaseStatistics.SIZE_UNKNOWN;
		} else if (accumulatedSize != BaseStatistics.SIZE_UNKNOWN) {
			accumulatedSize += pathSize;
		}
		newestModification = Math.max(newestModification, perPath.getLastModificationTime());
	}

	// the cached statistics remain valid when nothing was modified after they were taken
	final boolean cacheStillValid = cachedStats != null && newestModification <= cachedStats.getLastModificationTime();
	if (cacheStillValid) {
		return cachedStats;
	}

	return new FileBaseStatistics(newestModification, accumulatedSize, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
}
 
Example 5
Source Project: Flink-CEPplus   Source File: FileInputFormatTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Configures a multi-path format with two directories of three files each and checks
 * that the reported total input size is the sum of all six file sizes.
 */
@Test
public void testGetStatisticsMultipleFilesMultiplePathsNoCachedVersion() throws IOException {
	// first directory
	final long sizeA = 2077;
	final long sizeB = 31909;
	final long sizeC = 10;
	final String firstDir = TestFileUtils.createTempFileDir(temporaryFolder.newFolder(), sizeA, sizeB, sizeC);

	// second directory
	final long sizeD = 2051;
	final long sizeE = 31902;
	final long sizeF = 15;
	final String secondDir = TestFileUtils.createTempFileDir(temporaryFolder.newFolder(), sizeD, sizeE, sizeF);

	final MultiDummyFileInputFormat multiFormat = new MultiDummyFileInputFormat();
	multiFormat.setFilePaths(firstDir, secondDir);
	multiFormat.configure(new Configuration());

	// null: no cached statistics are handed in
	final long expectedBytes = (sizeA + sizeB + sizeC) + (sizeD + sizeE + sizeF);
	final BaseStatistics statistics = multiFormat.getStatistics(null);
	Assert.assertEquals("The file size from the statistics is wrong.", expectedBytes, statistics.getTotalInputSize());
}
 
Example 6
Source Project: Flink-CEPplus   Source File: BinaryInputFormatTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes two binary input files with 3 and 5 blocks respectively and checks that the
 * statistics over both paths report {@code blockSize * 8} bytes in total.
 */
@Test
public void testGetStatisticsMultiplePaths() throws IOException {
	// each block is the block-info size plus 8 further bytes
	final int bytesPerBlock = new BlockInfo().getInfoSize() + 8;
	final int blocksInFirst = 3;
	final int blocksInSecond = 5;

	final File firstInput = createBinaryInputFile("binary_input_format_test", bytesPerBlock, blocksInFirst);
	final File secondInput = createBinaryInputFile("binary_input_format_test_2", bytesPerBlock, blocksInSecond);

	final BinaryInputFormat<Record> binaryFormat = new MyBinaryInputFormat();
	binaryFormat.setFilePaths(firstInput.toURI().toString(), secondInput.toURI().toString());
	binaryFormat.setBlockSize(bytesPerBlock);

	final int expectedBytes = bytesPerBlock * (blocksInFirst + blocksInSecond);
	final BaseStatistics statistics = binaryFormat.getStatistics(null);
	Assert.assertEquals("The file size statistics is wrong", expectedBytes, statistics.getTotalInputSize());
}
 
Example 7
Source Project: Flink-CEPplus   Source File: EnumerateNestedFilesTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates one file inside a second-level directory, points the format at the first-level
 * directory with recursive enumeration switched on, and verifies the reported size.
 */
@Test
public void testGetStatisticsOneFileInNestedDir() {
	try {
		final long fileBytes = 1024 * 500;
		final String outerName = TestFileUtils.randomFileName();
		final String innerName = TestFileUtils.randomFileName();

		final File leafDir = tempFolder.newFolder(outerName, innerName);

		// the data file goes into the leaf directory
		TestFileUtils.createTempFileInDirectory(leafDir.getAbsolutePath(), fileBytes);

		// the input path is the parent of the leaf directory
		final String parentUri = leafDir.getParentFile().toURI().toString();
		this.format.setFilePath(new Path(parentUri));
		this.config.setBoolean("recursive.file.enumeration", true);
		this.format.configure(this.config);

		final long reportedBytes = this.format.getStatistics(null).getTotalInputSize();
		Assert.assertEquals("The file size from the statistics is wrong.", fileBytes, reportedBytes);
	} catch (Exception problem) {
		problem.printStackTrace();
		Assert.fail(problem.getMessage());
	}
}
 
Example 8
Source Project: flink   Source File: FileInputFormatTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Configures a multi-path format with two individual temp files and checks that the
 * reported total input size equals the sum of both file sizes.
 */
@Test
public void testGetStatisticsMultipleOneFileNoCachedVersion() throws IOException {
	final long firstBytes = 1024 * 500;
	final String firstFile = TestFileUtils.createTempFile(firstBytes);

	final long secondBytes = 1024 * 505;
	final String secondFile = TestFileUtils.createTempFile(secondBytes);

	final MultiDummyFileInputFormat multiFormat = new MultiDummyFileInputFormat();
	multiFormat.setFilePaths(firstFile, secondFile);
	multiFormat.configure(new Configuration());

	// null: no cached statistics are handed in
	final long reportedBytes = multiFormat.getStatistics(null).getTotalInputSize();
	Assert.assertEquals("The file size from the statistics is wrong.", firstBytes + secondBytes, reportedBytes);
}
 
Example 9
Source Project: flink   Source File: FileInputFormat.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Gathers file statistics for all configured input paths by delegating to
 * {@code getFileStats}.
 *
 * <p>Cached statistics are only reused if they are {@code FileBaseStatistics}; any other
 * type is treated as "no cache". Failures while gathering are logged, not propagated.
 *
 * @param cachedStats previously gathered statistics, may be {@code null}
 * @return the gathered statistics, or {@code null} if they could not be determined
 * @see org.apache.flink.api.common.io.InputFormat#getStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics)
 */
@Override
public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {

	FileBaseStatistics reusableStats = null;
	if (cachedStats instanceof FileBaseStatistics) {
		reusableStats = (FileBaseStatistics) cachedStats;
	}

	try {
		return getFileStats(reusableStats, getFilePaths(), new ArrayList<>(getFilePaths().length));
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			final String warning = "Could not determine statistics for paths '" + Arrays.toString(getFilePaths())
					+ "' due to an io error: " + ioex.getMessage();
			LOG.warn(warning);
		}
	}
	catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			final String error = "Unexpected problem while getting the file statistics for paths '"
					+ Arrays.toString(getFilePaths()) + "': " + t.getMessage();
			LOG.error(error, t);
		}
	}

	// reached only after a logged failure: no statistics available
	return null;
}
 
Example 10
Source Project: flink   Source File: FileInputFormat.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Combines the statistics of several input paths.
 *
 * <p>Sizes are summed per path. A single path with size
 * {@code BaseStatistics.SIZE_UNKNOWN} turns the combined size into
 * {@code SIZE_UNKNOWN}, and later known sizes do not change that. The combined
 * modification time is the latest one seen across all paths.
 *
 * @param cachedStats statistics from an earlier run, or {@code null}
 * @param filePaths paths to inspect
 * @param files list passed on to the single-path overload
 * @return the cached statistics if they are not older than the latest modification time,
 *         otherwise freshly built statistics with an unknown average record width
 * @throws IOException declared for failures while resolving a file system or inspecting a path
 */
protected FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, Path[] filePaths, ArrayList<FileStatus> files) throws IOException {

	long combinedLength = 0;
	long mostRecentChange = 0;

	for (Path inputPath : filePaths) {
		final FileBaseStatistics pathStats =
			getFileStats(cachedStats, inputPath, FileSystem.get(inputPath.toUri()), files);

		final long pathLength = pathStats.getTotalInputSize();
		final boolean pathUnknown = pathLength == BaseStatistics.SIZE_UNKNOWN;
		final boolean combinedKnown = combinedLength != BaseStatistics.SIZE_UNKNOWN;

		if (pathUnknown) {
			combinedLength = BaseStatistics.SIZE_UNKNOWN;
		} else if (combinedKnown) {
			combinedLength += pathLength;
		}

		mostRecentChange = Math.max(mostRecentChange, pathStats.getLastModificationTime());
	}

	// keep using the cached statistics unless something changed after they were taken
	if (cachedStats == null || mostRecentChange > cachedStats.getLastModificationTime()) {
		return new FileBaseStatistics(mostRecentChange, combinedLength, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
	}
	return cachedStats;
}
 
Example 11
Source Project: flink   Source File: BinaryInputFormatTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks the total input size reported for two binary files (3 and 5 blocks) that are
 * configured as separate paths on one format.
 */
@Test
public void testGetStatisticsMultiplePaths() throws IOException {
	final int infoBytes = new BlockInfo().getInfoSize();
	final int blockBytes = infoBytes + 8;
	final int firstBlockCount = 3;
	final int secondBlockCount = 5;

	final String firstUri =
		createBinaryInputFile("binary_input_format_test", blockBytes, firstBlockCount).toURI().toString();
	final String secondUri =
		createBinaryInputFile("binary_input_format_test_2", blockBytes, secondBlockCount).toURI().toString();

	final BinaryInputFormat<Record> format = new MyBinaryInputFormat();
	format.setFilePaths(firstUri, secondUri);
	format.setBlockSize(blockBytes);

	// null: no cached statistics are handed in
	final BaseStatistics statistics = format.getStatistics(null);
	Assert.assertEquals("The file size statistics is wrong", blockBytes * (firstBlockCount + secondBlockCount), statistics.getTotalInputSize());
}
 
Example 12
Source Project: flink   Source File: FileInputFormatTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates one temp file of a known size and verifies that statistics gathered without a
 * cached version report that size as total input size.
 */
@Test
public void testGetStatisticsOneFileNoCachedVersion() {
	try {
		final long fileBytes = 1024 * 500;
		final String filePath = TestFileUtils.createTempFile(fileBytes);

		final DummyFileInputFormat dummyFormat = new DummyFileInputFormat();
		dummyFormat.setFilePath(filePath);
		dummyFormat.configure(new Configuration());

		final BaseStatistics gathered = dummyFormat.getStatistics(null);
		final long reportedBytes = gathered.getTotalInputSize();
		Assert.assertEquals("The file size from the statistics is wrong.", fileBytes, reportedBytes);
	} catch (Exception problem) {
		problem.printStackTrace();
		Assert.fail(problem.getMessage());
	}
}
 
Example 13
Source Project: flink   Source File: FileInputFormatTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that a multi-path format pointed at two single files reports the combined
 * size of both files.
 */
@Test
public void testGetStatisticsMultipleOneFileNoCachedVersion() throws IOException {
	final long bytesA = 1024 * 500;
	final String fileA = TestFileUtils.createTempFile(bytesA);

	final long bytesB = 1024 * 505;
	final String fileB = TestFileUtils.createTempFile(bytesB);

	final MultiDummyFileInputFormat inputFormat = new MultiDummyFileInputFormat();
	inputFormat.setFilePaths(fileA, fileB);
	inputFormat.configure(new Configuration());

	final long expectedBytes = bytesA + bytesB;
	final BaseStatistics gathered = inputFormat.getStatistics(null);
	Assert.assertEquals("The file size from the statistics is wrong.", expectedBytes, gathered.getTotalInputSize());
}
 
Example 14
Source Project: flink   Source File: FileInputFormatTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Verifies the total input size for a multi-path format configured with two directories,
 * each containing three files of known sizes.
 */
@Test
public void testGetStatisticsMultipleFilesMultiplePathsNoCachedVersion() throws IOException {
	final long dirOneFile1 = 2077;
	final long dirOneFile2 = 31909;
	final long dirOneFile3 = 10;
	final long dirOneTotal = dirOneFile1 + dirOneFile2 + dirOneFile3;
	final String dirOne = TestFileUtils.createTempFileDir(temporaryFolder.newFolder(), dirOneFile1, dirOneFile2, dirOneFile3);

	final long dirTwoFile1 = 2051;
	final long dirTwoFile2 = 31902;
	final long dirTwoFile3 = 15;
	final long dirTwoTotal = dirTwoFile1 + dirTwoFile2 + dirTwoFile3;
	final String dirTwo = TestFileUtils.createTempFileDir(temporaryFolder.newFolder(), dirTwoFile1, dirTwoFile2, dirTwoFile3);

	final MultiDummyFileInputFormat inputFormat = new MultiDummyFileInputFormat();
	inputFormat.setFilePaths(dirOne, dirTwo);
	inputFormat.configure(new Configuration());

	// null: no cached statistics are handed in
	final long reportedBytes = inputFormat.getStatistics(null).getTotalInputSize();
	Assert.assertEquals("The file size from the statistics is wrong.", dirOneTotal + dirTwoTotal, reportedBytes);
}
 
Example 15
Source Project: flink   Source File: BinaryInputFormatTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates two binary input files of 3 and 5 blocks, registers both on one format and
 * checks that the reported size is the block size times the total block count.
 */
@Test
public void testGetStatisticsMultiplePaths() throws IOException {
	final int blockCountA = 3;
	final int blockCountB = 5;
	// block size: block-info size plus 8 further bytes
	final int blockLength = new BlockInfo().getInfoSize() + 8;

	final File inputA = createBinaryInputFile("binary_input_format_test", blockLength, blockCountA);
	final File inputB = createBinaryInputFile("binary_input_format_test_2", blockLength, blockCountB);

	final BinaryInputFormat<Record> binaryFormat = new MyBinaryInputFormat();
	binaryFormat.setFilePaths(inputA.toURI().toString(), inputB.toURI().toString());
	binaryFormat.setBlockSize(blockLength);

	final BaseStatistics gathered = binaryFormat.getStatistics(null);
	final int totalBlocks = blockCountA + blockCountB;
	Assert.assertEquals("The file size statistics is wrong", blockLength * totalBlocks, gathered.getTotalInputSize());
}
 
Example 16
Source Project: beam   Source File: ImpulseInputFormat.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Returns fixed statistics: a total input size of 1, a record count of 1 and an average
 * record width of 1. The passed-in statistics are not consulted.
 */
@Override
public BaseStatistics getStatistics(BaseStatistics baseStatistics) {
  return new BaseStatistics() {
    @Override
    public long getTotalInputSize() {
      return 1L;
    }

    @Override
    public long getNumberOfRecords() {
      return 1L;
    }

    @Override
    public float getAverageRecordWidth() {
      return 1.0f;
    }
  };
}
 
Example 17
Source Project: flink   Source File: SequentialFormatTestBase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that the record count reported by the statistics of a freshly created input
 * format equals {@code numberOfTuples}.
 */
@Test
public void checkStatistics() {
	final BinaryInputFormat<T> format = this.createInputFormat();
	// null: no cached statistics are handed in
	final long reportedRecords = format.getStatistics(null).getNumberOfRecords();
	Assert.assertEquals(this.numberOfTuples, reportedRecords);
}
 
Example 18
Source Project: Flink-CEPplus   Source File: HadoopInputFormatBase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Gathers file statistics for the input paths of the wrapped Hadoop (mapreduce) format.
 *
 * <p>Statistics are only gathered when the wrapped format is a {@code FileInputFormat}.
 * Failures while gathering are logged, not propagated.
 *
 * @param cachedStats previously gathered statistics; only reused if they are
 *        {@code FileBaseStatistics}
 * @return the gathered statistics, or {@code null} if the wrapped format is not file based
 *         or gathering failed
 */
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
	final boolean fileBased = mapreduceInputFormat instanceof FileInputFormat;
	if (!fileBased) {
		// nothing to gather for non-file formats
		return null;
	}

	JobContext jobContext = new JobContextImpl(configuration, null);

	FileBaseStatistics reusableStats = null;
	if (cachedStats instanceof FileBaseStatistics) {
		reusableStats = (FileBaseStatistics) cachedStats;
	}

	try {
		final org.apache.hadoop.fs.Path[] inputPaths = FileInputFormat.getInputPaths(jobContext);
		return getFileStats(reusableStats, inputPaths, new ArrayList<FileStatus>(1));
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			final String warning = "Could not determine statistics due to an io error: " + ioex.getMessage();
			LOG.warn(warning);
		}
	} catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			final String error = "Unexpected problem while getting the file statistics: " + t.getMessage();
			LOG.error(error, t);
		}
	}

	// reached only after a logged failure: no statistics available
	return null;
}
 
Example 19
Source Project: flink   Source File: BinaryInputFormatTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Configures two paths that do not exist and expects {@code getStatistics} to return
 * {@code null}.
 */
@Test
public void testGetStatisticsNonExistingFiles() {
	final MyBinaryInputFormat binaryFormat = new MyBinaryInputFormat();
	binaryFormat.setFilePaths("file:///some/none/existing/directory/", "file:///another/none/existing/directory/");
	binaryFormat.configure(new Configuration());

	// null: no cached statistics are handed in
	final BaseStatistics gathered = binaryFormat.getStatistics(null);
	Assert.assertNull("The file statistics should be null.", gathered);
}
 
Example 20
Source Project: Flink-CEPplus   Source File: FileInputFormat.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Gathers size and modification-time statistics for a single path.
 *
 * <p>A directory contributes whatever {@code addFilesInDir} returns for it; a plain file
 * is appended to {@code files}, passed to {@code testForUnsplittable} and contributes its
 * length. The modification time is the maximum over every entry currently in
 * {@code files}.
 *
 * @param cachedStats previously gathered statistics, or {@code null}
 * @param filePath the path to inspect
 * @param fs the file system used to look up {@code filePath}
 * @param files list that collects the inspected file statuses
 * @return {@code cachedStats} if it is non-null and not older than the newest modification
 *         time; otherwise new statistics whose size is {@code BaseStatistics.SIZE_UNKNOWN}
 *         when the gathered length is not positive
 * @throws IOException declared for failures while accessing the file system
 */
protected FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, Path filePath, FileSystem fs, ArrayList<FileStatus> files) throws IOException {

	final FileStatus status = fs.getFileStatus(filePath);
	long gatheredLength = 0;

	// collect the file(s) behind the path
	if (!status.isDir()) {
		files.add(status);
		testForUnsplittable(status);
		gatheredLength += status.getLen();
	} else {
		gatheredLength += addFilesInDir(status.getPath(), files, false);
	}

	// newest modification time among all collected files
	long newestModification = 0;
	for (FileStatus collected : files) {
		newestModification = Math.max(collected.getModificationTime(), newestModification);
	}

	// the cached statistics remain valid when nothing was modified after they were taken
	final boolean cacheStillValid = cachedStats != null && newestModification <= cachedStats.getLastModificationTime();
	if (cacheStillValid) {
		return cachedStats;
	}

	// a non-positive length is reported as unknown
	if (gatheredLength <= 0) {
		gatheredLength = BaseStatistics.SIZE_UNKNOWN;
	}
	return new FileBaseStatistics(newestModification, gatheredLength, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
}
 
Example 21
Source Project: Flink-CEPplus   Source File: BinaryInputFormat.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Gathers sequential statistics for all configured input paths.
 *
 * <p>If the file statistics returned by {@code getFileStats} are already
 * {@code SequentialStatistics} they are returned as they are; otherwise
 * {@code createStatistics} derives them from the collected files. Failures are logged,
 * not propagated.
 *
 * @param cachedStats previously gathered statistics; only reused if they are
 *        {@code FileBaseStatistics}
 * @return the sequential statistics, or {@code null} if none could be determined
 */
@Override
public SequentialStatistics getStatistics(BaseStatistics cachedStats) {

	FileBaseStatistics reusableStats = null;
	if (cachedStats instanceof FileBaseStatistics) {
		reusableStats = (FileBaseStatistics) cachedStats;
	}

	try {
		final ArrayList<FileStatus> collectedFiles = new ArrayList<FileStatus>(1);
		final FileBaseStatistics fileStats = getFileStats(reusableStats, getFilePaths(), collectedFiles);

		if (fileStats == null) {
			return null;
		}
		// statistics that are already sequential are still valid and returned unchanged
		if (!(fileStats instanceof SequentialStatistics)) {
			return createStatistics(collectedFiles, fileStats);
		}
		return (SequentialStatistics) fileStats;
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			final String warning = String.format(
				"Could not determine complete statistics for files '%s' due to an I/O error",
				Arrays.toString(getFilePaths()));
			LOG.warn(warning, ioex);
		}
	} catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			final String error = String.format(
				"Unexpected problem while getting the file statistics for files '%s'",
				Arrays.toString(getFilePaths()));
			LOG.error(error, t);
		}
	}

	// reached only after a logged failure: no stats available
	return null;
}
 
Example 22
Source Project: Flink-CEPplus   Source File: FileInputFormatTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Points the format at a path that does not exist and expects {@code getStatistics} to
 * return {@code null}.
 */
@Test
public void testGetStatisticsNonExistingFile() {
	try {
		final DummyFileInputFormat inputFormat = new DummyFileInputFormat();
		inputFormat.setFilePath("file:///some/none/existing/directory/");
		inputFormat.configure(new Configuration());

		// null: no cached statistics are handed in
		final BaseStatistics gathered = inputFormat.getStatistics(null);
		Assert.assertNull("The file statistics should be null.", gathered);
	} catch (Exception failure) {
		failure.printStackTrace();
		Assert.fail(failure.getMessage());
	}
}
 
Example 23
Source Project: Flink-CEPplus   Source File: FileInputFormatTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Fills a directory with two regular files and two files whose names start with an
 * underscore, all of the same size, and expects the statistics to count only the two
 * regular files.
 */
@Test
public void testGetStatsIgnoredUnderscoreFiles() {
	try {
		final int bytesPerFile = 2048;

		// two regular files followed by two underscore-prefixed files
		final File regularText = temporaryFolder.newFile("dataFile1.txt");
		final File regularBinary = temporaryFolder.newFile("another_file.bin");
		final File underscoreLuigi = temporaryFolder.newFile("_luigi");
		final File underscoreSuccess = temporaryFolder.newFile("_SUCCESS");

		createTempFiles(new byte[bytesPerFile], regularText, regularBinary, underscoreLuigi, underscoreSuccess);

		final DummyFileInputFormat inputFormat = new DummyFileInputFormat();
		inputFormat.setFilePath(temporaryFolder.getRoot().toURI().toString());
		inputFormat.configure(new Configuration());

		// expected total: the two regular files only
		final long expectedBytes = 2 * bytesPerFile;
		final BaseStatistics gathered = inputFormat.getStatistics(null);
		Assert.assertEquals(expectedBytes, gathered.getTotalInputSize());
	}
	catch (Exception failure) {
		System.err.println(failure.getMessage());
		failure.printStackTrace();
		Assert.fail(failure.getMessage());
	}
}
 
Example 24
Source Project: cascading-flink   Source File: TapInputFormat.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Gathers file statistics for the input paths configured in the job configuration.
 *
 * <p>Statistics are only gathered when the wrapped mapred format is a
 * {@code FileInputFormat}. Failures while gathering are logged, not propagated.
 *
 * @param cachedStats previously gathered statistics; only reused if they are
 *        {@code FileBaseStatistics}
 * @return the gathered statistics, or {@code null} if the wrapped format is not file based
 *         or gathering failed
 */
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
	final boolean fileBased = mapredInputFormat instanceof FileInputFormat;
	if (!fileBased) {
		// nothing to gather for non-file formats
		return null;
	}

	// instanceof is false for null, so no separate null check is needed
	FileBaseStatistics reusableStats = null;
	if (cachedStats instanceof FileBaseStatistics) {
		reusableStats = (FileBaseStatistics) cachedStats;
	}

	try {
		final org.apache.hadoop.fs.Path[] inputPaths = FileInputFormat.getInputPaths(this.jobConf);

		return getFileStats(reusableStats, inputPaths, new ArrayList<FileStatus>(1));
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			final String warning = "Could not determine statistics due to an io error: " + ioex.getMessage();
			LOG.warn(warning);
		}
	} catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			final String error = "Unexpected problem while getting the file statistics: " + t.getMessage();
			LOG.error(error, t);
		}
	}

	// reached only after a logged failure: no statistics available
	return null;
}
 
Example 25
Source Project: flink   Source File: HadoopInputFormatBase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Determines file statistics for the wrapped Hadoop (mapreduce) input format.
 *
 * <p>Returns {@code null} right away unless the wrapped format is a
 * {@code FileInputFormat}. Errors while gathering are logged and also result in
 * {@code null}.
 *
 * @param cachedStats statistics from an earlier call; ignored unless they are
 *        {@code FileBaseStatistics}
 * @return the file statistics, or {@code null} if none are available
 */
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
	if (!(mapreduceInputFormat instanceof FileInputFormat)) {
		// base statistics are only gathered for file-based formats
		return null;
	}

	JobContext jobContext = new JobContextImpl(configuration, null);

	final FileBaseStatistics reusableStats;
	if (cachedStats instanceof FileBaseStatistics) {
		reusableStats = (FileBaseStatistics) cachedStats;
	} else {
		reusableStats = null;
	}

	try {
		final ArrayList<FileStatus> collectedFiles = new ArrayList<FileStatus>(1);
		return getFileStats(reusableStats, FileInputFormat.getInputPaths(jobContext), collectedFiles);
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
		}
	} catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
		}
	}

	// reached only after a logged failure: no statistics available
	return null;
}
 
Example 26
/**
 * Gathers statistics for a directory containing two test data files and checks that the
 * reported record count and average record width fall inside the bounds derived from the
 * total size and the two line widths.
 */
@Test
public void testSamplingDirectory() {
	try {
		final String tempFile = TestFileUtils.createTempFileDir(testTempFolder, TEST_DATA1, TEST_DATA2);
		final Configuration conf = new Configuration();
		
		final TestDelimitedInputFormat format = new TestDelimitedInputFormat(CONFIG);
		format.setFilePath(tempFile);
		format.configure(conf);
		BaseStatistics stats = format.getStatistics(null);
		
		// bounds: the narrowest line width gives the most lines, the widest the fewest
		final int maxNumLines = (int) Math.ceil(TOTAL_SIZE / ((double) Math.min(TEST_DATA_1_LINEWIDTH, TEST_DATA_2_LINEWIDTH)));
		final int minNumLines = (int) (TOTAL_SIZE / ((double) Math.max(TEST_DATA_1_LINEWIDTH, TEST_DATA_2_LINEWIDTH)));
		final float maxAvgWidth = ((float) (TOTAL_SIZE)) / minNumLines;
		final float minAvgWidth = ((float) (TOTAL_SIZE)) / maxNumLines;
		
		// short-circuit && instead of the bitwise & on boolean operands
		final long numRecords = stats.getNumberOfRecords();
		if (!(numRecords <= maxNumLines && numRecords >= minNumLines)) {
			System.err.println("Records: " + numRecords + " out of (" + minNumLines + ", " + maxNumLines + ").");
			Assert.fail("Wrong record count.");
		}
		// kept in negated form so that a NaN width still fails the check
		final float avgWidth = stats.getAverageRecordWidth();
		if (!(avgWidth <= maxAvgWidth && avgWidth >= minAvgWidth)) {
			Assert.fail("Wrong avg record size.");
		}
	} catch (Exception e) {
		e.printStackTrace();
		Assert.fail(e.getMessage());
	}
}
 
Example 27
/**
 * Verifies that passing previously gathered statistics back into {@code getStatistics}
 * avoids re-sampling: no stream is opened on the second call and the very same statistics
 * object is returned.
 */
@Test
public void testCachedStatistics() {
	try {
		final String tempFile = TestFileUtils.createTempFile(TEST_DATA1);
		final Configuration conf = new Configuration();
		
		final TestDelimitedInputFormat format = new TestDelimitedInputFormat(CONFIG);
		format.setFilePath("test://" + tempFile);
		format.configure(conf);
		
		// first call without cache: sampling is expected to open the default number of streams
		TestFileSystem.resetStreamOpenCounter();
		BaseStatistics stats = format.getStatistics(null);
		Assert.assertEquals("Wrong number of samples taken.", DEFAULT_NUM_SAMPLES, TestFileSystem.getNumtimeStreamOpened());
		
		final TestDelimitedInputFormat format2 = new TestDelimitedInputFormat(CONFIG);
		format2.setFilePath("test://" + tempFile);
		format2.configure(conf);
		
		// second call with the cached statistics: no stream may be opened.
		// assertEquals/assertSame report the offending values on failure, unlike assertTrue.
		TestFileSystem.resetStreamOpenCounter();
		BaseStatistics stats2 = format2.getStatistics(stats);
		Assert.assertEquals("Using cached statistics should circumvent sampling.", 0, TestFileSystem.getNumtimeStreamOpened());
		Assert.assertSame("Using cached statistics should circumvent sampling.", stats, stats2);
		
	} catch (Exception e) {
		e.printStackTrace();
		Assert.fail(e.getMessage());
	}
}
 
Example 28
Source Project: flink   Source File: DelimitedInputFormatTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Exercises the cached-statistics path of {@code getStatistics} for a single file:
 * <ol>
 *   <li>no cache: the real file size is reported;</li>
 *   <li>the statistics just gathered are passed back and must be returned;</li>
 *   <li>fake statistics carrying the same modification time are returned as they are;</li>
 *   <li>fake statistics with an older modification time are replaced by fresh ones.</li>
 * </ol>
 */
@Test
public void testGetStatisticsSingleFileWithCachedVersion() throws IOException {
	final String myString = "my mocked line 1\nmy mocked line 2\n";
	final Path tempFile = createTempFilePath(myString);
	// NOTE(review): character count is used as byte size — holds for this ASCII-only content
	final long size = myString.length();
	final long fakeSize = 10065;

	DelimitedInputFormat<String> format = new MyTextInputFormat();
	format.setFilePath(tempFile);
	format.configure(new Configuration());

	FileBaseStatistics stats = format.getStatistics(null);
	assertNotNull(stats);
	assertEquals("The file size from the statistics is wrong.", size, stats.getTotalInputSize());
	
	format = new MyTextInputFormat();
	format.setFilePath(tempFile);
	format.configure(new Configuration());
	
	// expected value first, actual second: the cached stats are what we expect to get back
	FileBaseStatistics newStats = format.getStatistics(stats);
	assertEquals("Statistics object was changed.", stats, newStats);
	
	// insert fake stats with the correct modification time. the call should return the fake stats
	format = new MyTextInputFormat();
	format.setFilePath(tempFile);
	format.configure(new Configuration());
	
	FileBaseStatistics fakeStats = new FileBaseStatistics(stats.getLastModificationTime(), fakeSize, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
	BaseStatistics latest = format.getStatistics(fakeStats);
	assertEquals("The file size from the statistics is wrong.", fakeSize, latest.getTotalInputSize());
	
	// insert fake stats with the expired modification time. the call should return new accurate stats
	format = new MyTextInputFormat();
	format.setFilePath(tempFile);
	format.configure(new Configuration());
	
	FileBaseStatistics outDatedFakeStats = new FileBaseStatistics(stats.getLastModificationTime() - 1, fakeSize, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
	BaseStatistics reGathered = format.getStatistics(outDatedFakeStats);
	assertEquals("The file size from the statistics is wrong.", size, reGathered.getTotalInputSize());
}
 
Example 29
Source Project: flink   Source File: HadoopInputFormatBase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Determines file statistics for the wrapped Hadoop (mapred) input format, using the
 * input paths from the job configuration.
 *
 * <p>Returns {@code null} right away unless the wrapped format is a
 * {@code FileInputFormat}. Errors while gathering are logged and also result in
 * {@code null}.
 *
 * @param cachedStats statistics from an earlier call; ignored unless they are
 *        {@code FileBaseStatistics}
 * @return the file statistics, or {@code null} if none are available
 */
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
	if (!(mapredInputFormat instanceof FileInputFormat)) {
		// base statistics are only gathered for file-based formats
		return null;
	}

	FileBaseStatistics reusableStats = null;
	if (cachedStats instanceof FileBaseStatistics) {
		reusableStats = (FileBaseStatistics) cachedStats;
	}

	try {
		final org.apache.hadoop.fs.Path[] inputPaths = FileInputFormat.getInputPaths(this.jobConf);
		final ArrayList<FileStatus> collectedFiles = new ArrayList<FileStatus>(1);

		return getFileStats(reusableStats, inputPaths, collectedFiles);
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
		}
	} catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
		}
	}

	// reached only after a logged failure: no statistics available
	return null;
}
 
Example 30
Source Project: flink   Source File: HadoopInputFormatBase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Collects file statistics for the input paths of the wrapped Hadoop (mapreduce) format.
 *
 * <p>Only file-based formats ({@code FileInputFormat}) are supported; for any other
 * format, and after a logged failure, the result is {@code null}.
 *
 * @param cachedStats earlier statistics; reused only when they are
 *        {@code FileBaseStatistics}
 * @return the collected statistics, or {@code null} if none are available
 */
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
	final boolean fileBased = mapreduceInputFormat instanceof FileInputFormat;
	if (!fileBased) {
		return null;
	}

	JobContext jobContext = new JobContextImpl(configuration, null);

	// anything that is not FileBaseStatistics (including null) counts as "no cache"
	final FileBaseStatistics reusableStats;
	if (cachedStats instanceof FileBaseStatistics) {
		reusableStats = (FileBaseStatistics) cachedStats;
	} else {
		reusableStats = null;
	}

	try {
		final org.apache.hadoop.fs.Path[] inputPaths = FileInputFormat.getInputPaths(jobContext);
		return getFileStats(reusableStats, inputPaths, new ArrayList<FileStatus>(1));
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			final String warning = "Could not determine statistics due to an io error: " + ioex.getMessage();
			LOG.warn(warning);
		}
	} catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			final String error = "Unexpected problem while getting the file statistics: " + t.getMessage();
			LOG.error(error, t);
		}
	}

	// reached only after a logged failure: no statistics available
	return null;
}