/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce.lib.input;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.junit.Test;

public class TestLineRecordReader {

  private void testSplitRecords(String testFileName, long firstSplitLength)
      throws IOException {
    URL testFileUrl = getClass().getClassLoader().getResource(testFileName);
    assertNotNull("Cannot find " + testFileName, testFileUrl);
    File testFile = new File(testFileUrl.getFile());
    long testFileSize = testFile.length();
    Path testFilePath = new Path(testFile.getAbsolutePath());
    Configuration conf = new Configuration();
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.
        LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    assertTrue("unexpected test data at " + testFile,
        testFileSize > firstSplitLength);

    TaskAttemptContext context = new TaskAttemptContextImpl(conf,
        new TaskAttemptID());

    // read the data without splitting to count the records
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize,
        (String[])null);
    LineRecordReader reader = new LineRecordReader();
    reader.initialize(split, context);
    int numRecordsNoSplits = 0;
    while (reader.nextKeyValue()) {
      ++numRecordsNoSplits;
    }
    reader.close();

    // count the records in the first split
    split = new FileSplit(testFilePath, 0, firstSplitLength, (String[])null);
    reader = new LineRecordReader();
    reader.initialize(split, context);
    int numRecordsFirstSplit = 0;
    while (reader.nextKeyValue()) {
      ++numRecordsFirstSplit;
    }
    reader.close();

    // count the records in the second split
    split = new FileSplit(testFilePath, firstSplitLength,
        testFileSize - firstSplitLength, (String[])null);
    reader = new LineRecordReader();
    reader.initialize(split, context);
    int numRecordsRemainingSplits = 0;
    while (reader.nextKeyValue()) {
      ++numRecordsRemainingSplits;
    }
    reader.close();

    assertEquals("Unexpected number of records in bzip2 compressed split",
        numRecordsNoSplits, numRecordsFirstSplit + numRecordsRemainingSplits);
  }

  @Test
  public void testBzip2SplitEndsAtCR() throws IOException {
    // the test data contains a carriage-return at the end of the first
    // split which ends at compressed offset 136498 and the next
    // character is not a linefeed
    testSplitRecords("blockEndingInCR.txt.bz2", 136498);
  }

  @Test
  public void testBzip2SplitEndsAtCRThenLF() throws IOException {
    // the test data contains a carriage-return at the end of the first
    // split which ends at compressed offset 136498 and the next
    // character is a linefeed
    testSplitRecords("blockEndingInCRThenLF.txt.bz2", 136498);
  }

  // Use the LineRecordReader to read records from the file
  public ArrayList<String> readRecords(URL testFileUrl, int splitSize)
      throws IOException {

    // Set up context
    File testFile = new File(testFileUrl.getFile());
    long testFileSize = testFile.length();
    Path testFilePath = new Path(testFile.getAbsolutePath());
    Configuration conf = new Configuration();
    conf.setInt("io.file.buffer.size", 1);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf,
        new TaskAttemptID());

    // Gather the records returned by the record reader
    ArrayList<String> records = new ArrayList<String>();

    long offset = 0;
    while (offset < testFileSize) {
      FileSplit split = new FileSplit(testFilePath, offset, splitSize, null);
      LineRecordReader reader = new LineRecordReader();
      reader.initialize(split, context);

      while (reader.nextKeyValue()) {
        records.add(reader.getCurrentValue().toString());
      }
      offset += splitSize;
    }
    return records;
  }

  // Gather the records by just splitting on new lines
  public String[] readRecordsDirectly(URL testFileUrl, boolean bzip)
      throws IOException {
    int MAX_DATA_SIZE = 1024 * 1024;
    byte[] data = new byte[MAX_DATA_SIZE];
    FileInputStream fis = new FileInputStream(testFileUrl.getFile());
    int count;
    if (bzip) {
      BZip2CompressorInputStream bzIn = new BZip2CompressorInputStream(fis);
      count = bzIn.read(data);
      bzIn.close();
    } else {
      count = fis.read(data);
    }
    fis.close();
    assertTrue("Test file data too big for buffer", count < data.length);
    return new String(data, 0, count, "UTF-8").split("\n");
  }

  public void checkRecordSpanningMultipleSplits(String testFile,
                                                int splitSize,
                                                boolean bzip)
      throws IOException {
    URL testFileUrl = getClass().getClassLoader().getResource(testFile);
    ArrayList<String> records = readRecords(testFileUrl, splitSize);
    String[] actuals = readRecordsDirectly(testFileUrl, bzip);

    assertEquals("Wrong number of records", actuals.length, records.size());

    boolean hasLargeRecord = false;
    for (int i = 0; i < actuals.length; ++i) {
      assertEquals(actuals[i], records.get(i));
      if (actuals[i].length() > 2 * splitSize) {
        hasLargeRecord = true;
      }
    }

    assertTrue("Invalid test data. Doesn't have a large enough record",
        hasLargeRecord);
  }

  @Test
  public void testRecordSpanningMultipleSplits()
      throws IOException {
    checkRecordSpanningMultipleSplits("recordSpanningMultipleSplits.txt",
        10, false);
  }

  @Test
  public void testRecordSpanningMultipleSplitsCompressed()
      throws IOException {
    // The file is generated with bz2 block size of 100k. The split size
    // needs to be larger than that for the CompressedSplitLineReader to
    // work.
    checkRecordSpanningMultipleSplits("recordSpanningMultipleSplits.txt.bz2",
        200 * 1000, true);
  }

  @Test
  public void testStripBOM() throws IOException {
    // the test data contains a BOM at the start of the file
    // confirm the BOM is skipped by LineRecordReader
    String UTF8_BOM = "\uFEFF";
    URL testFileUrl = getClass().getClassLoader().getResource("testBOM.txt");
    assertNotNull("Cannot find testBOM.txt", testFileUrl);
    File testFile = new File(testFileUrl.getFile());
    Path testFilePath = new Path(testFile.getAbsolutePath());
    long testFileSize = testFile.length();
    Configuration conf = new Configuration();
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.
        LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);

    TaskAttemptContext context = new TaskAttemptContextImpl(conf,
        new TaskAttemptID());

    // read the data and check whether BOM is skipped
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize,
        (String[])null);
    LineRecordReader reader = new LineRecordReader();
    reader.initialize(split, context);
    int numRecords = 0;
    boolean firstLine = true;
    boolean skipBOM = true;
    while (reader.nextKeyValue()) {
      if (firstLine) {
        firstLine = false;
        if (reader.getCurrentValue().toString().startsWith(UTF8_BOM)) {
          skipBOM = false;
        }
      }
      ++numRecords;
    }
    reader.close();

    assertTrue("BOM is not skipped", skipBOM);
  }

  @Test
  public void testMultipleClose() throws IOException {
    URL testFileUrl = getClass().getClassLoader().
        getResource("recordSpanningMultipleSplits.txt.bz2");
    assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2",
        testFileUrl);
    File testFile = new File(testFileUrl.getFile());
    Path testFilePath = new Path(testFile.getAbsolutePath());
    long testFileSize = testFile.length();
    Configuration conf = new Configuration();
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.
        LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf,
        new TaskAttemptID());

    // read all the data, then close the reader twice; a redundant close
    // must not return the same decompressor to the CodecPool twice
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
    LineRecordReader reader = new LineRecordReader();
    reader.initialize(split, context);
    //noinspection StatementWithEmptyBody
    while (reader.nextKeyValue()) ;
    reader.close();
    reader.close();

    // if the double close returned a duplicate decompressor to the pool,
    // the pool would hand out the same instance more than once below and
    // the set would contain fewer than 10 distinct decompressors
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(conf);
    Set<Decompressor> decompressors = new HashSet<Decompressor>();
    for (int i = 0; i < 10; ++i) {
      decompressors.add(CodecPool.getDecompressor(codec));
    }
    assertEquals(10, decompressors.size());
  }
}