Java Code Examples for org.apache.hadoop.util.ReflectionUtils#setConf()

The following examples show how to use org.apache.hadoop.util.ReflectionUtils#setConf(). Each example is drawn from an open-source project; the project and source file it comes from are noted above the code.
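
Before the project examples, here is a minimal sketch of the basic pattern. ReflectionUtils.setConf(Object, Configuration) injects a Configuration into an already-constructed object: if the target implements org.apache.hadoop.conf.Configurable, its setConf() is invoked; other objects are left untouched. The class and variable names below are invented purely for illustration.

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;

// Hypothetical Configurable target; any class implementing Configurable
// will have its setConf() called by ReflectionUtils.setConf().
class ConfAwareWidget implements Configurable {
  private Configuration conf;

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;  // receives the injected Configuration
  }

  @Override
  public Configuration getConf() {
    return conf;
  }
}

public class SetConfSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    ConfAwareWidget widget = new ConfAwareWidget();
    // Equivalent to widget.setConf(conf), but safe to call on any Object:
    // non-Configurable targets are simply ignored.
    ReflectionUtils.setConf(widget, conf);
    System.out.println(widget.getConf() == conf);  // prints: true
  }
}
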
Example 1
Source File: WritableComparator.java    From big-c with Apache License 2.0
/** Get a comparator for a {@link WritableComparable} implementation. */
public static WritableComparator get(
    Class<? extends WritableComparable> c, Configuration conf) {
  WritableComparator comparator = comparators.get(c);
  if (comparator == null) {
    // force the static initializers to run
    forceInit(c);
    // look to see if it is defined now
    comparator = comparators.get(c);
    // if not, use the generic one
    if (comparator == null) {
      comparator = new WritableComparator(c, conf, true);
    }
  }
  // Newly passed Configuration objects should be used.
  ReflectionUtils.setConf(comparator, conf);
  return comparator;
}
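
Note the design choice this example highlights: WritableComparator instances are cached, so a cached comparator may hold a stale Configuration. Re-applying ReflectionUtils.setConf(comparator, conf) on every lookup injects the caller's newly passed Configuration into the cached instance, which is exactly what the closing comment in the code points out.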
 
Example 2
Source File: TestFixedLengthInputFormat.java    From hadoop with Apache License 2.0
/**
 * Test using the gzip codec with two input files.
 */
@Test (timeout=5000)
public void testGzipWithTwoInputs() throws IOException {
  CompressionCodec gzip = new GzipCodec();
  localFs.delete(workDir, true);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  JobConf job = new JobConf(defaultConf);
  format.setRecordLength(job, 5);
  FileInputFormat.setInputPaths(job, workDir);
  ReflectionUtils.setConf(gzip, job);
  format.configure(job);
  // Create files of fixed-length records, 5 bytes per record.
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, 
      "one  two  threefour five six  seveneightnine ten  ");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "ten  nine eightsevensix  five four threetwo  one  ");
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<String> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 10, results.size());
  assertEquals("splits[0][5]", "six  ", results.get(5));
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 10, results.size());
  assertEquals("splits[1][0]", "ten  ", results.get(0));
  assertEquals("splits[1][1]", "nine ", results.get(1));
}
 
Example 3
Source File: TestTextInputFormat.java    From hadoop-gpu with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, 
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", 
               results.get(0).toString());    
  assertEquals("splits[1][1]", "of gzip", 
               results.get(1).toString());    
}
 
Example 4
Source File: SequenceFile.java    From RDFS with Apache License 2.0
/** Initialize. */
@SuppressWarnings("unchecked")
void init(Path name, Configuration conf, FSDataOutputStream out,
          Class keyClass, Class valClass,
          boolean compress, CompressionCodec codec, Metadata metadata) 
  throws IOException {
  this.conf = conf;
  this.out = out;
  this.keyClass = keyClass;
  this.valClass = valClass;
  this.compress = compress;
  this.codec = codec;
  this.metadata = metadata;
  SerializationFactory serializationFactory = new SerializationFactory(conf);
  this.keySerializer = serializationFactory.getSerializer(keyClass);
  this.keySerializer.open(buffer);
  this.uncompressedValSerializer = serializationFactory.getSerializer(valClass);
  this.uncompressedValSerializer.open(buffer);
  if (this.codec != null) {
    ReflectionUtils.setConf(this.codec, this.conf);
    this.compressor = CodecPool.getCompressor(this.codec);
    this.deflateFilter = this.codec.createOutputStream(buffer, compressor);
    this.deflateOut = 
      new DataOutputStream(new BufferedOutputStream(deflateFilter));
    this.compressedValSerializer = serializationFactory.getSerializer(valClass);
    this.compressedValSerializer.open(deflateOut);
  }
}
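
A recurring pattern, visible in this example and the gzip tests: compression codecs such as GzipCodec implement Configurable, so a hand-constructed codec should be passed through ReflectionUtils.setConf(codec, conf) before it is asked for compressors or streams; otherwise the codec would see a null or default Configuration rather than the job's settings.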
 
Example 5
Source File: TestFixedLengthInputFormat.java    From big-c with Apache License 2.0
/**
 * Test using the gzip codec with two input files.
 */
@Test (timeout=5000)
public void testGzipWithTwoInputs() throws Exception {
  CompressionCodec gzip = new GzipCodec();
  localFs.delete(workDir, true);
  Job job = Job.getInstance(defaultConf);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 5);
  ReflectionUtils.setConf(gzip, job.getConfiguration());
  FileInputFormat.setInputPaths(job, workDir);
  // Create files of fixed-length records, 5 bytes per record.
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, 
      "one  two  threefour five six  seveneightnine ten  ");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "ten  nine eightsevensix  five four threetwo  one  ");
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 2", 2, splits.size());
  FileSplit tmp = (FileSplit) splits.get(0);
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits.set(0, splits.get(1));
    splits.set(1, tmp);
  }
  List<String> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 10, results.size());
  assertEquals("splits[0][5]", "six  ", results.get(5));
  results = readSplit(format, splits.get(1), job);
  assertEquals("splits[1] length", 10, results.size());
  assertEquals("splits[1][0]", "ten  ", results.get(0));
  assertEquals("splits[1][1]", "nine ", results.get(1));
}
 
Example 6
Source File: TestFixedLengthInputFormat.java    From hadoop with Apache License 2.0
/**
 * Test using the gzip codec with two input files.
 */
@Test (timeout=5000)
public void testGzipWithTwoInputs() throws Exception {
  CompressionCodec gzip = new GzipCodec();
  localFs.delete(workDir, true);
  Job job = Job.getInstance(defaultConf);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 5);
  ReflectionUtils.setConf(gzip, job.getConfiguration());
  FileInputFormat.setInputPaths(job, workDir);
  // Create files of fixed-length records, 5 bytes per record.
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, 
      "one  two  threefour five six  seveneightnine ten  ");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "ten  nine eightsevensix  five four threetwo  one  ");
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 2", 2, splits.size());
  FileSplit tmp = (FileSplit) splits.get(0);
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits.set(0, splits.get(1));
    splits.set(1, tmp);
  }
  List<String> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 10, results.size());
  assertEquals("splits[0][5]", "six  ", results.get(5));
  results = readSplit(format, splits.get(1), job);
  assertEquals("splits[1] length", 10, results.size());
  assertEquals("splits[1][0]", "ten  ", results.get(0));
  assertEquals("splits[1][1]", "nine ", results.get(1));
}
 
Example 7
Source File: TestCombineTextInputFormat.java    From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, workDir);
  CombineTextInputFormat format = new CombineTextInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 1", 1, splits.size());
  List<Text> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 8, results.size());

  final String[] firstList =
    {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
  final String[] secondList = {"this is a test", "of gzip"};
  String first = results.get(0).toString();
  if (first.equals(firstList[0])) {
    testResults(results, firstList, secondList);
  } else if (first.equals(secondList[0])) {
    testResults(results, secondList, firstList);
  } else {
    fail("unexpected first token!");
  }
}
 
Example 8
Source File: HadoopInputFormatBase.java    From Flink-CEPplus with Apache License 2.0
public HadoopInputFormatBase(org.apache.hadoop.mapred.InputFormat<K, V> mapredInputFormat, Class<K> key, Class<V> value, JobConf job) {
	super(job.getCredentials());
	this.mapredInputFormat = mapredInputFormat;
	this.keyClass = key;
	this.valueClass = value;
	HadoopUtils.mergeHadoopConf(job);
	this.jobConf = job;
	ReflectionUtils.setConf(mapredInputFormat, jobConf);
}
 
Example 9
Source File: TestCombineTextInputFormat.java    From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  CombineTextInputFormat format = new CombineTextInputFormat();
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 1", 1, splits.length);
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 8, results.size());

  final String[] firstList =
    {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
  final String[] secondList = {"this is a test", "of gzip"};
  String first = results.get(0).toString();
  if (first.equals(firstList[0])) {
    testResults(results, firstList, secondList);
  } else if (first.equals(secondList[0])) {
    testResults(results, secondList, firstList);
  } else {
    fail("unexpected first token!");
  }
}
 
Example 10
Source File: TestKeyValueTextInputFormat.java    From hadoop-gpu with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, 
            "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "line-1\tthis is a test\nline-1\tof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  KeyValueTextInputFormat format = new KeyValueTextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", 
               results.get(0).toString());    
  assertEquals("splits[1][1]", "of gzip", 
               results.get(1).toString());    
}
 
Example 11
Source File: TestTextInputFormat.java    From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test (timeout=5000)
public void testGzip() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, 
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", 
               results.get(0).toString());    
  assertEquals("splits[1][1]", "of gzip", 
               results.get(1).toString());    
}
 
Example 12
Source File: TestKeyValueTextInputFormat.java    From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, 
            "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "line-1\tthis is a test\nline-1\tof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  KeyValueTextInputFormat format = new KeyValueTextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", 
               results.get(0).toString());    
  assertEquals("splits[1][1]", "of gzip", 
               results.get(1).toString());    
}
 
Example 13
Source File: TestTextInputFormat.java    From RDFS with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, 
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", 
               results.get(0).toString());    
  assertEquals("splits[1][1]", "of gzip", 
               results.get(1).toString());    
}
 
Example 14
Source File: TestFixedLengthInputFormat.java    From big-c with Apache License 2.0
private void runPartialRecordTest(CompressionCodec codec) throws IOException {
  localFs.delete(workDir, true);
  // Create a file of fixed-length records (5 bytes each),
  // ending with a partial record.
  StringBuilder fileName = new StringBuilder("testFormat.txt");
  if (codec != null) {
    fileName.append(".gz");
  }
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  JobConf job = new JobConf(defaultConf);
  format.setRecordLength(job, 5);
  FileInputFormat.setInputPaths(job, workDir);
  if (codec != null) {
    ReflectionUtils.setConf(codec, job);
  }
  format.configure(job);
  writeFile(localFs, new Path(workDir, fileName.toString()), codec,
          "one  two  threefour five six  seveneightnine ten");
  InputSplit[] splits = format.getSplits(job, 100);
  if (codec != null) {
    assertEquals("compressed splits == 1", 1, splits.length);
  }
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      List<String> results = readSplit(format, split, job);
    } catch(IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for partial record:", exceptionThrown);
}
 
Example 15
Source File: HadoopInputFormatBase.java    From flink with Apache License 2.0
public HadoopInputFormatBase(org.apache.hadoop.mapred.InputFormat<K, V> mapredInputFormat, Class<K> key, Class<V> value, JobConf job) {
	super(job.getCredentials());
	this.mapredInputFormat = mapredInputFormat;
	this.keyClass = key;
	this.valueClass = value;
	HadoopUtils.mergeHadoopConf(job);
	this.jobConf = job;
	ReflectionUtils.setConf(mapredInputFormat, jobConf);
}
 
Example 16
Source File: TestFixedLengthInputFormat.java    From hadoop with Apache License 2.0
private void runRandomTests(CompressionCodec codec) throws IOException {
  StringBuilder fileName = new StringBuilder("testFormat.txt");
  if (codec != null) {
    fileName.append(".gz");
  }
  localFs.delete(workDir, true);
  Path file = new Path(workDir, fileName.toString());
  int seed = new Random().nextInt();
  LOG.info("Seed = " + seed);
  Random random = new Random(seed);
  int MAX_TESTS = 20;
  LongWritable key = new LongWritable();
  BytesWritable value = new BytesWritable();

  for (int i = 0; i < MAX_TESTS; i++) {
    LOG.info("----------------------------------------------------------");
    // Maximum total records of 999
    int totalRecords = random.nextInt(999)+1;
    // Test an empty file
    if (i == 8) {
       totalRecords = 0;
    }
    // Maximum bytes in a record of 100K
    int recordLength = random.nextInt(1024*100)+1;
    // For the 11th test, force a record length of 1
    if (i == 10) {
      recordLength = 1;
    }
    // The total bytes in the test file
    int fileSize = (totalRecords * recordLength);
    LOG.info("totalRecords=" + totalRecords + " recordLength="
        + recordLength);
    // Create the job 
    JobConf job = new JobConf(defaultConf);
    if (codec != null) {
      ReflectionUtils.setConf(codec, job);
    }
    // Create the test file
    ArrayList<String> recordList
        = createFile(file, codec, recordLength, totalRecords);
    assertTrue(localFs.exists(file));
    //set the fixed length record length config property for the job
    FixedLengthInputFormat.setRecordLength(job, recordLength);

    int numSplits = 1;
    // Arbitrarily set number of splits.
    if (i > 0) {
      if (i == (MAX_TESTS-1)) {
        // Test a split size that is less than record len
        numSplits = (int)(fileSize/Math.floor(recordLength/2));
      } else {
        if (MAX_TESTS % i == 0) {
          // Let us create a split size that is forced to be 
          // smaller than the end file itself, (ensures 1+ splits)
          numSplits = fileSize/(fileSize - random.nextInt(fileSize));
        } else {
          // Just pick a random split size with no upper bound 
          numSplits = Math.max(1, fileSize/random.nextInt(Integer.MAX_VALUE));
        }
      }
      LOG.info("Number of splits set to: " + numSplits);
    }

    // Setup the input path
    FileInputFormat.setInputPaths(job, workDir);
    // Try splitting the file in a variety of sizes
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    format.configure(job);
    InputSplit splits[] = format.getSplits(job, numSplits);
    LOG.info("Actual number of splits = " + splits.length);
    // Test combined split lengths = total file size
    long recordOffset = 0;
    int recordNumber = 0;
    for (InputSplit split : splits) {
      RecordReader<LongWritable, BytesWritable> reader = 
          format.getRecordReader(split, job, voidReporter);
      Class<?> clazz = reader.getClass();
      assertEquals("RecordReader class should be FixedLengthRecordReader:", 
          FixedLengthRecordReader.class, clazz);
      // Plow through the records in this split
      while (reader.next(key, value)) {
        assertEquals("Checking key", (long)(recordNumber*recordLength),
            key.get());
        String valueString =
            new String(value.getBytes(), 0, value.getLength());
        assertEquals("Checking record length:", recordLength,
            value.getLength());
        assertTrue("Checking for more records than expected:",
            recordNumber < totalRecords);
        String origRecord = recordList.get(recordNumber);
        assertEquals("Checking record content:", origRecord, valueString);
        recordNumber++;
      }
      reader.close();
    }
    assertEquals("Total original records should be total read records:",
        recordList.size(), recordNumber);
  }
}
 
Example 17
Source File: TestConcatenatedCompressedInput.java    From big-c with Apache License 2.0
/**
 * Test using Hadoop's original, native-zlib gzip codec for reading.
 */
  @Test
  public void testGzip() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);

    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, jobConf);
    localFs.delete(workDir, true);

    // preferred, but not compatible with Apache/trunk instance of Hudson:
/*
    assertFalse("[native (C/C++) codec]",
      (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
       gzip.getDecompressorType()) );
    System.out.println(COLOR_BR_RED +
      "testGzip() using native-zlib Decompressor (" +
      gzip.getDecompressorType() + ")" + COLOR_NORMAL);
 */

    // alternative:
    if (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
        gzip.getDecompressorType()) {
      System.out.println(COLOR_BR_RED +
        "testGzip() using native-zlib Decompressor (" +
        gzip.getDecompressorType() + ")" + COLOR_NORMAL);
    } else {
      LOG.warn("testGzip() skipped:  native (C/C++) libs not loaded");
      return;
    }

/*
 *      // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
 *      //                see https://issues.apache.org/jira/browse/HADOOP-6799
 *  Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
 *  //OutputStream out = localFs.create(fnHDFS);
 *  //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
 *      // can just combine those two lines, probably
 *  //GzipCodec.GzipOutputStream gzOStm =
 *  //  new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
 *      // oops, no:  this is a protected helper class; need to access
 *      //   it via createOutputStream() instead:
 *  OutputStream out = localFs.create(fnHDFS);
 *  Compressor gzCmp = gzip.createCompressor();
 *  CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
 *      // this SHOULD be going to HDFS:  got out from localFs == HDFS
 *      //   ...yup, works
 *  gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
 *  gzOStm.finish();
 *  gzOStm.resetState();
 *  gzOStm.write("2nd gzip concat member\n".getBytes());
 *  gzOStm.finish();
 *  gzOStm.resetState();
 *  gzOStm.write("gzip concat\nmember #3\n".getBytes());
 *  gzOStm.close();
 *      //
 *  String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
 *  Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
 *  localFs.copyToLocalFile(fnHDFS, fnLocal);
 */

    // copy prebuilt (correct!) version of concat.gz to HDFS
    final String fn = "concat" + gzip.getDefaultExtension();
    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS  = new Path(workDir, fn);
    localFs.copyFromLocalFile(fnLocal, fnHDFS);

    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
              "this is a test\nof gzip\n");
    FileInputFormat.setInputPaths(jobConf, workDir);
    TextInputFormat format = new TextInputFormat();
    format.configure(jobConf);

    InputSplit[] splits = format.getSplits(jobConf, 100);
    assertEquals("compressed splits == 2", 2, splits.length);
    FileSplit tmp = (FileSplit) splits[0];
    if (tmp.getPath().getName().equals("part2.txt.gz")) {
      splits[0] = splits[1];
      splits[1] = tmp;
    }

    List<Text> results = readSplit(format, splits[0], jobConf);
    assertEquals("splits[0] num lines", 6, results.size());
    assertEquals("splits[0][5]", "member #3",
                 results.get(5).toString());

    results = readSplit(format, splits[1], jobConf);
    assertEquals("splits[1] num lines", 2, results.size());
    assertEquals("splits[1][0]", "this is a test",
                 results.get(0).toString());
    assertEquals("splits[1][1]", "of gzip",
                 results.get(1).toString());
  }
 
Example 18
Source File: TestGroupedSplits.java    From tez with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "is\ngzip\n");
  writeFile(localFs, new Path(workDir, "part3.txt.gz"), gzip,
      "one\nmore\nsplit\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat wrappedFormat = new TextInputFormat();
  wrappedFormat.configure(job);
  TezGroupedSplitsInputFormat<LongWritable , Text> format = 
      new TezGroupedSplitsInputFormat<LongWritable, Text>();
  format.setConf(job);
  format.setInputFormat(wrappedFormat);
  
  // TextInputFormat will produce 3 splits
  for (int j=1; j<=3; ++j) {
    format.setDesiredNumberOfSplits(j);
    InputSplit[] splits = format.getSplits(job, 100);
    if (j==1) {
      // j==1 covers single split corner case
      // and does not do grouping
      assertEquals("compressed splits == " + j, j, splits.length);
    }
    List<Text> results = new ArrayList<Text>();
    for (int i=0; i<splits.length; ++i) { 
      List<Text> read = readSplit(format, splits[i], job);
      results.addAll(read);
    }
    assertEquals("splits length", 11, results.size());

    final String[] firstList =
      {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
    final String[] secondList = {"is", "gzip"};
    final String[] thirdList = {"one", "more", "split"};
    String first = results.get(0).toString();
    int start = 0;
    switch (first.charAt(0)) {
    case 't':
      start = testResults(results, firstList, start);
      break;
    case 'i':
      start = testResults(results, secondList, start);
      break;
    case 'o':
      start = testResults(results, thirdList, start);
      break;
    default:
      Assert.fail("unexpected first token - " + first);
    }
  }
}
 
Example 19
Source File: TestConcatenatedCompressedInput.java    From hadoop with Apache License 2.0
/**
 * Extended bzip2 test, similar to BuiltInGzipDecompressor test above.
 */
  @Test
  public void testMoreBzip2() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);

    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, jobConf);
    localFs.delete(workDir, true);

    System.out.println(COLOR_BR_MAGENTA +
      "testMoreBzip2() using non-native CBZip2InputStream (presumably)" +
      COLOR_NORMAL);

    // copy single-member test file to HDFS
    String fn1 = "testConcatThenCompress.txt" + bzip2.getDefaultExtension();
    Path fnLocal1 = new Path(System.getProperty("test.concat.data","/tmp"),fn1);
    Path fnHDFS1  = new Path(workDir, fn1);
    localFs.copyFromLocalFile(fnLocal1, fnHDFS1);

    // copy multiple-member test file to HDFS
    String fn2 = "testCompressThenConcat.txt" + bzip2.getDefaultExtension();
    Path fnLocal2 = new Path(System.getProperty("test.concat.data","/tmp"),fn2);
    Path fnHDFS2  = new Path(workDir, fn2);
    localFs.copyFromLocalFile(fnLocal2, fnHDFS2);

    FileInputFormat.setInputPaths(jobConf, workDir);

    // here's first pair of BlockDecompressorStreams:
    final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
    final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
    assertEquals("concat bytes available", 2567, in1.available());
    assertEquals("concat bytes available", 3056, in2.available());

/*
    // FIXME
    // The while-loop below dies at the beginning of the 2nd concatenated
    // member (after 17 lines successfully read) with:
    //
    //   java.io.IOException: bad block header
    //   at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.initBlock(
    //   CBZip2InputStream.java:527)
    //
    // It is not critical to concatenated-gzip support, HADOOP-6835, so it's
    // simply commented out for now (and HADOOP-6852 filed).  If and when the
    // latter issue is resolved--perhaps by fixing an error here--this code
    // should be reenabled.  Note that the doMultipleBzip2BufferSizes() test
    // below uses the same testCompressThenConcat.txt.bz2 file but works fine.

    CompressionInputStream cin2 = bzip2.createInputStream(in2);
    LineReader in = new LineReader(cin2);
    Text out = new Text();

    int numBytes, totalBytes=0, lineNum=0;
    while ((numBytes = in.readLine(out)) > 0) {
      ++lineNum;
      totalBytes += numBytes;
    }
    in.close();
    assertEquals("total uncompressed bytes in concatenated test file",
                 5346, totalBytes);
    assertEquals("total uncompressed lines in concatenated test file",
                 84, lineNum);
 */

    // test CBZip2InputStream with lots of different input-buffer sizes
    doMultipleBzip2BufferSizes(jobConf, false);

    // no native version of bzip2 codec (yet?)
    //doMultipleBzip2BufferSizes(jobConf, true);
  }
 
Example 20
Source File: TestFixedLengthInputFormat.java    From big-c with Apache License 2.0
private void runRandomTests(CompressionCodec codec) throws IOException {
  StringBuilder fileName = new StringBuilder("testFormat.txt");
  if (codec != null) {
    fileName.append(".gz");
  }
  localFs.delete(workDir, true);
  Path file = new Path(workDir, fileName.toString());
  int seed = new Random().nextInt();
  LOG.info("Seed = " + seed);
  Random random = new Random(seed);
  int MAX_TESTS = 20;
  LongWritable key = new LongWritable();
  BytesWritable value = new BytesWritable();

  for (int i = 0; i < MAX_TESTS; i++) {
    LOG.info("----------------------------------------------------------");
    // Maximum total records of 999
    int totalRecords = random.nextInt(999)+1;
    // Test an empty file
    if (i == 8) {
       totalRecords = 0;
    }
    // Maximum bytes in a record of 100K
    int recordLength = random.nextInt(1024*100)+1;
    // For the 11th test, force a record length of 1
    if (i == 10) {
      recordLength = 1;
    }
    // The total bytes in the test file
    int fileSize = (totalRecords * recordLength);
    LOG.info("totalRecords=" + totalRecords + " recordLength="
        + recordLength);
    // Create the job 
    JobConf job = new JobConf(defaultConf);
    if (codec != null) {
      ReflectionUtils.setConf(codec, job);
    }
    // Create the test file
    ArrayList<String> recordList
        = createFile(file, codec, recordLength, totalRecords);
    assertTrue(localFs.exists(file));
    //set the fixed length record length config property for the job
    FixedLengthInputFormat.setRecordLength(job, recordLength);

    int numSplits = 1;
    // Arbitrarily set number of splits.
    if (i > 0) {
      if (i == (MAX_TESTS-1)) {
        // Test a split size that is less than record len
        numSplits = (int)(fileSize/Math.floor(recordLength/2));
      } else {
        if (MAX_TESTS % i == 0) {
          // Let us create a split size that is forced to be 
          // smaller than the end file itself, (ensures 1+ splits)
          numSplits = fileSize/(fileSize - random.nextInt(fileSize));
        } else {
          // Just pick a random split size with no upper bound 
          numSplits = Math.max(1, fileSize/random.nextInt(Integer.MAX_VALUE));
        }
      }
      LOG.info("Number of splits set to: " + numSplits);
    }

    // Setup the input path
    FileInputFormat.setInputPaths(job, workDir);
    // Try splitting the file in a variety of sizes
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    format.configure(job);
    InputSplit splits[] = format.getSplits(job, numSplits);
    LOG.info("Actual number of splits = " + splits.length);
    // Test combined split lengths = total file size
    long recordOffset = 0;
    int recordNumber = 0;
    for (InputSplit split : splits) {
      RecordReader<LongWritable, BytesWritable> reader = 
          format.getRecordReader(split, job, voidReporter);
      Class<?> clazz = reader.getClass();
      assertEquals("RecordReader class should be FixedLengthRecordReader:", 
          FixedLengthRecordReader.class, clazz);
      // Plow through the records in this split
      while (reader.next(key, value)) {
        assertEquals("Checking key", (long)(recordNumber*recordLength),
            key.get());
        String valueString =
            new String(value.getBytes(), 0, value.getLength());
        assertEquals("Checking record length:", recordLength,
            value.getLength());
        assertTrue("Checking for more records than expected:",
            recordNumber < totalRecords);
        String origRecord = recordList.get(recordNumber);
        assertEquals("Checking record content:", origRecord, valueString);
        recordNumber++;
      }
      reader.close();
    }
    assertEquals("Total original records should be total read records:",
        recordList.size(), recordNumber);
  }
}