org.apache.hadoop.io.compress.SnappyCodec Java Examples
The following examples, taken from a number of open-source projects, show how to use org.apache.hadoop.io.compress.SnappyCodec, Hadoop's Snappy compression codec. Each example notes its source file, the project it comes from, and that project's license.
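Before the project-specific examples, here is a minimal round-trip sketch of the basic pattern most of them share: check that native Snappy support is loaded, create the codec, give it a Configuration, and wrap an ordinary stream with createOutputStream or createInputStream. The wrapper class name and the "data.snappy" file name below are illustrative assumptions, not taken from any particular example.

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.SnappyCodec;

public class SnappyCodecRoundTrip {

  public static void main(String[] args) throws IOException {
    // Fail fast if the native Snappy support in libhadoop is not available.
    SnappyCodec.checkNativeCodeLoaded();

    // The codec must be given a Configuration before it can create streams.
    SnappyCodec codec = new SnappyCodec();
    codec.setConf(new Configuration());

    // Compress: wrap a plain OutputStream with the codec.
    // "data.snappy" is an illustrative file name, not taken from the examples below.
    try (OutputStream out = codec.createOutputStream(new FileOutputStream("data.snappy"))) {
      out.write("hello snappy".getBytes("UTF-8"));
    }

    // Decompress: wrap a plain InputStream the same way and read the data back.
    try (InputStream in = codec.createInputStream(new FileInputStream("data.snappy"))) {
      byte[] buf = new byte[64];
      int n = in.read(buf);
      System.out.println(new String(buf, 0, n, "UTF-8"));
    }
  }
}

Several of the examples below use the same calls indirectly, either obtaining the codec through ReflectionUtils.newInstance(SnappyCodec.class, conf) or simply registering SnappyCodec.class as a MapReduce map-output or job-output compression codec instead of creating streams by hand.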
Example #1
Source File: CommonSnappyShim.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
/**
 * Gets an OutputStream that uses the Snappy codec and wraps the supplied base output stream.
 *
 * @param bufferSize the buffer size for the codec to use (in bytes)
 * @param out        the base output stream to wrap around
 * @return an OutputStream that uses the Snappy codec
 * @throws Exception if Snappy is not available or an error occurs during reflection
 */
public OutputStream getSnappyOutputStream( int bufferSize, OutputStream out ) throws Exception {
  if ( !isHadoopSnappyAvailable() ) {
    throw new Exception( "Hadoop-snappy does not seem to be available" );
  }

  ClassLoader cl = Thread.currentThread().getContextClassLoader();
  Thread.currentThread().setContextClassLoader( getClass().getClassLoader() );
  try {
    SnappyCodec c = new SnappyCodec();
    Configuration newConf = new Configuration();
    newConf.set( IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_KEY, "" + bufferSize );
    c.setConf( newConf );
    return c.createOutputStream( out );
  } finally {
    Thread.currentThread().setContextClassLoader( cl );
  }
}
Example #2
Source File: CommonSnappyShim.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
/**
 * Gets an InputStream that uses the Snappy codec and wraps the supplied base input stream.
 *
 * @param bufferSize the buffer size for the codec to use (in bytes)
 * @param in         the base input stream to wrap around
 * @return an InputStream that uses the Snappy codec
 * @throws Exception if Snappy is not available or an error occurs during reflection
 */
public InputStream getSnappyInputStream( int bufferSize, InputStream in ) throws Exception {
  if ( !isHadoopSnappyAvailable() ) {
    throw new Exception( "Hadoop-snappy does not seem to be available" );
  }

  ClassLoader cl = Thread.currentThread().getContextClassLoader();
  Thread.currentThread().setContextClassLoader( getClass().getClassLoader() );
  try {
    SnappyCodec c = new SnappyCodec();
    Configuration newConf = new Configuration();
    newConf.set( IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_KEY, "" + bufferSize );
    c.setConf( newConf );
    return c.createInputStream( in );
  } finally {
    Thread.currentThread().setContextClassLoader( cl );
  }
}
Example #3
Source File: TestNativeCodeLoader.java From hadoop with Apache License 2.0 | 6 votes |
@Test
public void testNativeCodeLoaded() {
  if (requireTestJni() == false) {
    LOG.info("TestNativeCodeLoader: libhadoop.so testing is not required.");
    return;
  }
  if (!NativeCodeLoader.isNativeCodeLoaded()) {
    fail("TestNativeCodeLoader: libhadoop.so testing was required, but " +
        "libhadoop.so was not loaded.");
  }
  assertFalse(NativeCodeLoader.getLibraryName().isEmpty());
  // Library names depend on the platform and build environment,
  // so just check that the names are available.
  assertFalse(ZlibFactory.getLibraryName().isEmpty());
  if (NativeCodeLoader.buildSupportsSnappy()) {
    assertFalse(SnappyCodec.getLibraryName().isEmpty());
  }
  if (NativeCodeLoader.buildSupportsOpenssl()) {
    assertFalse(OpensslCipher.getLibraryName().isEmpty());
  }
  assertFalse(Lz4Codec.getLibraryName().isEmpty());
  LOG.info("TestNativeCodeLoader: libhadoop.so is loaded.");
}
Example #4
Source File: CsvBlurDriverTest.java From incubator-retired-blur with Apache License 2.0 | 6 votes |
@Test
public void testCsvBlurDriverTest3() throws Exception {
  Configuration configurationSetup = new Configuration();
  ControllerPool controllerPool = new CsvBlurDriver.ControllerPool() {
    @Override
    public Iface getClient(String controllerConnectionStr) {
      return getMockIface();
    }
  };
  AtomicReference<Callable<Void>> ref = new AtomicReference<Callable<Void>>();
  Job job = CsvBlurDriver.setupJob(configurationSetup, controllerPool, ref, "-c", "host:40010", "-d", "family1",
      "col1", "col2", "-d", "family2", "col3", "col4", "-t", "table1", "-i", _path1.toString(), "-i",
      _path2.toString(), "-S", "-C", "1000000", "2000000", "-p", "SNAPPY");
  assertNotNull(job);
  Configuration configuration = job.getConfiguration();
  TableDescriptor tableDescriptor = BlurOutputFormat.getTableDescriptor(configuration);
  assertEquals(tableDescriptor.getName(), "table1");
  Collection<String> inputs = configuration.getStringCollection("mapred.input.dir");
  assertEquals(2, inputs.size());
  Map<String, List<String>> familyAndColumnNameMap = CsvBlurMapper.getFamilyAndColumnNameMap(configuration);
  assertEquals(2, familyAndColumnNameMap.size());
  assertEquals("true", configuration.get(CsvBlurDriver.MAPRED_COMPRESS_MAP_OUTPUT));
  assertEquals(SnappyCodec.class.getName(), configuration.get(CsvBlurDriver.MAPRED_MAP_OUTPUT_COMPRESSION_CODEC));
}
Example #5
Source File: TestNativeCodeLoader.java From big-c with Apache License 2.0 | 6 votes |
@Test
public void testNativeCodeLoaded() {
  if (requireTestJni() == false) {
    LOG.info("TestNativeCodeLoader: libhadoop.so testing is not required.");
    return;
  }
  if (!NativeCodeLoader.isNativeCodeLoaded()) {
    fail("TestNativeCodeLoader: libhadoop.so testing was required, but " +
        "libhadoop.so was not loaded.");
  }
  assertFalse(NativeCodeLoader.getLibraryName().isEmpty());
  // Library names depend on the platform and build environment,
  // so just check that the names are available.
  assertFalse(ZlibFactory.getLibraryName().isEmpty());
  if (NativeCodeLoader.buildSupportsSnappy()) {
    assertFalse(SnappyCodec.getLibraryName().isEmpty());
  }
  if (NativeCodeLoader.buildSupportsOpenssl()) {
    assertFalse(OpensslCipher.getLibraryName().isEmpty());
  }
  assertFalse(Lz4Codec.getLibraryName().isEmpty());
  LOG.info("TestNativeCodeLoader: libhadoop.so is loaded.");
}
Example #6
Source File: HadoopWordCount2.java From ignite with Apache License 2.0 | 6 votes |
/**
 * Sets task classes with related info if needed into configuration object.
 *
 * @param job Configuration to change.
 * @param setMapper Option to set mapper and input format classes.
 * @param setCombiner Option to set combiner class.
 * @param setReducer Option to set reducer and output format classes.
 * @param outputCompression Option to compress the job output.
 */
public static void setTasksClasses(Job job, boolean setMapper, boolean setCombiner, boolean setReducer,
    boolean outputCompression) {
  if (setMapper) {
    job.setMapperClass(HadoopWordCount2Mapper.class);
    job.setInputFormatClass(TextInputFormat.class);
  }

  if (setCombiner)
    job.setCombinerClass(HadoopWordCount2Combiner.class);

  if (setReducer) {
    job.setReducerClass(HadoopWordCount2Reducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
  }

  if (outputCompression) {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    SequenceFileOutputFormat.setCompressOutput(job, true);

    job.getConfiguration().set(FileOutputFormat.COMPRESS_CODEC, SnappyCodec.class.getName());
  }
}
Example #7
Source File: AbstractFileOutputOperatorTest.java From attic-apex-malhar with Apache License 2.0 | 5 votes |
private boolean checkNativeSnappy() {
  try {
    SnappyCodec.checkNativeCodeLoaded();
  } catch (UnsatisfiedLinkError u) {
    LOG.error("WARNING: Skipping Snappy compression test since native libraries were not found.");
    return true;
  } catch (RuntimeException e) {
    LOG.error("WARNING: Skipping Snappy compression test since native libraries were not found.");
    return true;
  }
  return false;
}
Example #8
Source File: SnappyShimImpl.java From pentaho-hadoop-shims with Apache License 2.0 | 5 votes |
/**
 * Tests whether hadoop-snappy (not to be confused with other java-based snappy implementations such as jsnappy or
 * snappy-java) plus the native snappy libraries are available.
 *
 * @return true if hadoop-snappy is available on the classpath
 */
public boolean isHadoopSnappyAvailable() {
  ClassLoader cl = Thread.currentThread().getContextClassLoader();
  Thread.currentThread().setContextClassLoader( getClass().getClassLoader() );
  try {
    return SnappyCodec.isNativeCodeLoaded();
  } catch ( Throwable t ) {
    return false;
  } finally {
    Thread.currentThread().setContextClassLoader( cl );
  }
}
Example #9
Source File: JsonORCFileReaderWriterFactory.java From secor with Apache License 2.0 | 5 votes |
/**
 * Returns the ORC compression kind that corresponds to the given Hadoop compression codec.
 *
 * @param codec the Hadoop compression codec configured for the writer
 * @return the matching ORC CompressionKind, or NONE if there is no match
 */
private CompressionKind resolveCompression(CompressionCodec codec) {
  if (codec instanceof Lz4Codec)
    return CompressionKind.LZ4;
  else if (codec instanceof SnappyCodec)
    return CompressionKind.SNAPPY;
  // Although GZip and ZLIB are not the same thing, there is no better-named codec
  // for this case, so use the Hadoop Gzip codec to enable ORC ZLIB compression.
  else if (codec instanceof GzipCodec)
    return CompressionKind.ZLIB;
  else
    return CompressionKind.NONE;
}
Example #10
Source File: BloomFilterCreator.java From hiped2 with Apache License 2.0 | 5 votes |
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  JobConf job = new JobConf(conf);
  job.setJarByClass(BloomFilterCreator.class);

  job.set(AvroJob.OUTPUT_SCHEMA, AvroBytesRecord.SCHEMA.toString());
  job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

  job.setInputFormat(KeyValueTextInputFormat.class);
  job.setOutputFormat(AvroOutputFormat.class);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(NullWritable.class);
  job.setMapOutputValueClass(BloomFilter.class);

  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(BloomFilter.class);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  return JobClient.runJob(job).isSuccessful() ? 0 : 1;
}
Example #11
Source File: AbstractFileOutputOperatorTest.java From attic-apex-malhar with Apache License 2.0 | 5 votes |
@Test
public void testSnappyCompressionSimple() throws IOException {
  if (checkNativeSnappy()) {
    return;
  }

  File snappyFile = new File(testMeta.getDir(), "snappyTestFile.snappy");

  BufferedOutputStream os = new BufferedOutputStream(new FileOutputStream(snappyFile));
  Configuration conf = new Configuration();
  CompressionCodec codec = (CompressionCodec)ReflectionUtils.newInstance(SnappyCodec.class, conf);
  FilterStreamCodec.SnappyFilterStream filterStream = new FilterStreamCodec.SnappyFilterStream(
      codec.createOutputStream(os));

  int ONE_MB = 1024 * 1024;

  String testStr = "TestSnap-16bytes";
  for (int i = 0; i < ONE_MB; i++) { // write 16 MB
    filterStream.write(testStr.getBytes());
  }
  filterStream.flush();
  filterStream.close();

  CompressionInputStream is = codec.createInputStream(new FileInputStream(snappyFile));

  byte[] recovered = new byte[testStr.length()];
  int bytesRead = is.read(recovered);
  is.close();
  assertEquals(testStr, new String(recovered));
}
Example #12
Source File: FilterStreamCodec.java From attic-apex-malhar with Apache License 2.0 | 5 votes |
public SnappyFilterStreamContext(OutputStream outputStream) throws IOException {
  SnappyCodec codec = new SnappyCodec();
  codec.setConf(new Configuration());
  try {
    filterStream = new SnappyFilterStream(
        codec.createOutputStream(outputStream, new SnappyCompressor(bufferSize)));
  } catch (IOException e) {
    throw e;
  }
}
Example #13
Source File: Phase2ExactMatchDeDuplication.java From dkpro-c4corpus with Apache License 2.0 | 5 votes |
@Override
public int run(String[] args) throws Exception {
  Job job = Job.getInstance(getConf());
  // set from the command line

  job.setJarByClass(Phase2ExactMatchDeDuplication.class);
  job.setJobName(Phase2ExactMatchDeDuplication.class.getName());

  // mapper
  job.setMapperClass(ExactMatchDetectionMapper.class);

  // we will compress the mapper's output (use fast Snappy compressor)
  job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
  job.getConfiguration().setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

  // reducer
  job.setReducerClass(UniqueWarcWriterReducer.class);
  // no combiner, as the output classes in mapper and reducer are different!

  // input-output is warc
  job.setInputFormatClass(WARCInputFormat.class);
  job.setOutputFormatClass(WARCOutputFormat.class);

  // mapper output data
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(WARCWritable.class);

  // set output compression to GZip
  FileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

  FileInputFormat.addInputPaths(job, args[0]);
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  return job.waitForCompletion(true) ? 0 : 1;
}
Example #14
Source File: AbstractHadoopProcessor.java From localization_nifi with Apache License 2.0 | 5 votes |
@Override
public String toString() {
  switch (this) {
    case NONE:
      return "NONE";
    case DEFAULT:
      return DefaultCodec.class.getName();
    case BZIP:
      return BZip2Codec.class.getName();
    case GZIP:
      return GzipCodec.class.getName();
    case LZ4:
      return Lz4Codec.class.getName();
    case SNAPPY:
      return SnappyCodec.class.getName();
    case AUTOMATIC:
      return "Automatically Detected";
  }
  return null;
}
Example #15
Source File: HdfsSink2.java From sylph with Apache License 2.0 | 5 votes |
public HdfsSink2(Hdfs2SinkConfig config) throws ClassNotFoundException {
  this.batchSize = config.getBatchBufferSize();
  this.writerDir = config.getWriteDir();
  switch (config.getZipType().trim().toLowerCase()) {
    case "lzo":
      codecClass = (Class<? extends CompressionCodec>) Class.forName("com.hadoop.compression.lzo.LzopCodec");
      break;
    case "lz4":
      codecClass = Lz4Codec.class;
      break;
    case "snappy":
      codecClass = SnappyCodec.class;
      break;
    case "gzip":
      codecClass = GzipCodec.class;
      break;
    case "bzip2":
      codecClass = BZip2Codec.class;
      break;
    case "default":
      codecClass = DefaultCodec.class;
      break;
    default:
      codecClass = NoneCodec.class;
  }
}
Example #16
Source File: TestSnappyCompressorDecompressor.java From hadoop with Apache License 2.0 | 5 votes |
@Test
public void testSnappyDirectBlockCompression() {
  int[] size = { 4 * 1024, 64 * 1024, 128 * 1024, 1024 * 1024 };
  assumeTrue(SnappyCodec.isNativeCodeLoaded());
  try {
    for (int i = 0; i < size.length; i++) {
      compressDecompressLoop(size[i]);
    }
  } catch (IOException ex) {
    fail("testSnappyDirectBlockCompression ex !!!" + ex);
  }
}
Example #17
Source File: TestSnappyCompressorDecompressor.java From big-c with Apache License 2.0 | 5 votes |
@Test
public void testSnappyDirectBlockCompression() {
  int[] size = { 4 * 1024, 64 * 1024, 128 * 1024, 1024 * 1024 };
  assumeTrue(SnappyCodec.isNativeCodeLoaded());
  try {
    for (int i = 0; i < size.length; i++) {
      compressDecompressLoop(size[i]);
    }
  } catch (IOException ex) {
    fail("testSnappyDirectBlockCompression ex !!!" + ex);
  }
}
Example #18
Source File: Phase1FullJob.java From dkpro-c4corpus with Apache License 2.0 | 5 votes |
@Override
public int run(String[] args) throws Exception {
  Job job = Job.getInstance(getConf());
  // set from the command line

  job.setJarByClass(Phase1FullJob.class);
  job.setJobName(Phase1FullJob.class.getName());

  // mapper
  job.setMapperClass(MapperClass.class);

  // we will compress the mapper's output (use fast Snappy compressor)
  job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
  job.getConfiguration().setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

  // reducer
  job.setReducerClass(SimpleWarcWriterReducer.class);

  // input-output is warc
  job.setInputFormatClass(WARCInputFormat.class);
  job.setOutputFormatClass(WARCOutputFormat.class);

  // mapper output data
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(WARCWritable.class);

  // set output compression to GZip
  FileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

  FileInputFormat.addInputPaths(job, args[0]);
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  return job.waitForCompletion(true) ? 0 : 1;
}
Example #19
Source File: TestSnappyCompressorDecompressor.java From big-c with Apache License 2.0 | 4 votes |
@Before
public void before() {
  assumeTrue(SnappyCodec.isNativeCodeLoaded());
}
Example #20
Source File: DBImportMapReduce.java From hiped2 with Apache License 2.0 | 4 votes |
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.OutputFileOption.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path output = new Path(cli.getArgValueAsString(CliCommonOpts.OutputFileOption.OUTPUT));

  Configuration conf = super.getConf();

  DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
      "jdbc:mysql://localhost/sqoop_test" + "?user=hip_sqoop_user&password=password");

  JobConf job = new JobConf(conf);
  job.setJarByClass(DBImportMapReduce.class);

  job.setInputFormat(DBInputFormat.class);
  job.setOutputFormat(AvroOutputFormat.class);
  AvroJob.setOutputSchema(job, Stock.SCHEMA$);
  job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

  job.setMapperClass(Map.class);

  job.setNumMapTasks(4);
  job.setNumReduceTasks(0);

  job.setMapOutputKeyClass(AvroWrapper.class);
  job.setMapOutputValueClass(NullWritable.class);

  job.setOutputKeyClass(AvroWrapper.class);
  job.setOutputValueClass(NullWritable.class);

  FileOutputFormat.setOutputPath(job, output);

  DBInputFormat.setInput(
      job,
      StockDbWritable.class,
      "select * from stocks",
      "SELECT COUNT(id) FROM stocks");

  RunningJob runningJob = JobClient.runJob(job);

  return runningJob.isSuccessful() ? 0 : 1;
}
Example #21
Source File: NativeLibraryChecker.java From big-c with Apache License 2.0 | 4 votes |
/**
 * A tool to test native library availability.
 */
public static void main(String[] args) {
  String usage = "NativeLibraryChecker [-a|-h]\n"
      + " -a use -a to check all libraries are available\n"
      + " by default just check hadoop library (and\n"
      + " winutils.exe on Windows OS) is available\n"
      + " exit with error code 1 if check failed\n"
      + " -h print this message\n";
  if (args.length > 1 ||
      (args.length == 1 && !(args[0].equals("-a") || args[0].equals("-h")))) {
    System.err.println(usage);
    ExitUtil.terminate(1);
  }
  boolean checkAll = false;
  if (args.length == 1) {
    if (args[0].equals("-h")) {
      System.out.println(usage);
      return;
    }
    checkAll = true;
  }
  Configuration conf = new Configuration();
  boolean nativeHadoopLoaded = NativeCodeLoader.isNativeCodeLoaded();
  boolean zlibLoaded = false;
  boolean snappyLoaded = false;
  // lz4 is linked within libhadoop
  boolean lz4Loaded = nativeHadoopLoaded;
  boolean bzip2Loaded = Bzip2Factory.isNativeBzip2Loaded(conf);
  boolean openSslLoaded = false;
  boolean winutilsExists = false;

  String openSslDetail = "";
  String hadoopLibraryName = "";
  String zlibLibraryName = "";
  String snappyLibraryName = "";
  String lz4LibraryName = "";
  String bzip2LibraryName = "";
  String winutilsPath = null;

  if (nativeHadoopLoaded) {
    hadoopLibraryName = NativeCodeLoader.getLibraryName();
    zlibLoaded = ZlibFactory.isNativeZlibLoaded(conf);
    if (zlibLoaded) {
      zlibLibraryName = ZlibFactory.getLibraryName();
    }
    snappyLoaded = NativeCodeLoader.buildSupportsSnappy() &&
        SnappyCodec.isNativeCodeLoaded();
    if (snappyLoaded && NativeCodeLoader.buildSupportsSnappy()) {
      snappyLibraryName = SnappyCodec.getLibraryName();
    }
    if (OpensslCipher.getLoadingFailureReason() != null) {
      openSslDetail = OpensslCipher.getLoadingFailureReason();
      openSslLoaded = false;
    } else {
      openSslDetail = OpensslCipher.getLibraryName();
      openSslLoaded = true;
    }
    if (lz4Loaded) {
      lz4LibraryName = Lz4Codec.getLibraryName();
    }
    if (bzip2Loaded) {
      bzip2LibraryName = Bzip2Factory.getLibraryName(conf);
    }
  }

  // winutils.exe is required on Windows
  winutilsPath = Shell.getWinUtilsPath();
  if (winutilsPath != null) {
    winutilsExists = true;
  } else {
    winutilsPath = "";
  }

  System.out.println("Native library checking:");
  System.out.printf("hadoop: %b %s%n", nativeHadoopLoaded, hadoopLibraryName);
  System.out.printf("zlib: %b %s%n", zlibLoaded, zlibLibraryName);
  System.out.printf("snappy: %b %s%n", snappyLoaded, snappyLibraryName);
  System.out.printf("lz4: %b %s%n", lz4Loaded, lz4LibraryName);
  System.out.printf("bzip2: %b %s%n", bzip2Loaded, bzip2LibraryName);
  System.out.printf("openssl: %b %s%n", openSslLoaded, openSslDetail);
  if (Shell.WINDOWS) {
    System.out.printf("winutils: %b %s%n", winutilsExists, winutilsPath);
  }

  if ((!nativeHadoopLoaded) || (Shell.WINDOWS && (!winutilsExists)) ||
      (checkAll && !(zlibLoaded && snappyLoaded && lz4Loaded && bzip2Loaded))) {
    // return 1 to indicate the check failed
    ExitUtil.terminate(1);
  }
}
Example #22
Source File: TestSnappyCompressorDecompressor.java From hadoop with Apache License 2.0 | 4 votes |
@Before
public void before() {
  assumeTrue(SnappyCodec.isNativeCodeLoaded());
}
Example #23
Source File: ExportHBaseTableToAvro.java From HBase-ToHDFS with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
  if (args.length == 0) {
    System.out.println("ExportHBaseTableToAvro {tableName} {ColumnFamily} {outputPath} {compressionCodec snappy,gzip} {schemaLocationOnHdfs} {rowKeyColumn.Optional}");
    return;
  }

  String table = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String compressionCodec = args[3];
  String schemaFilePath = args[4];
  String rowKeyColumn = "";

  if (args.length > 5) {
    rowKeyColumn = args[5];
  }

  Job job = Job.getInstance();
  HBaseConfiguration.addHbaseResources(job.getConfiguration());

  job.setJarByClass(ExportHBaseTableToAvro.class);
  job.setJobName("ExportHBaseTableToAvro ");

  job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
  job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  scan.addFamily(Bytes.toBytes(columnFamily));

  TableMapReduceUtil.initTableMapperJob(table, // input HBase table name
      scan, // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper
      null, // mapper output key
      null, // mapper output value
      job);

  job.setOutputFormatClass(AvroKeyOutputFormat.class);
  AvroKeyOutputFormat.setOutputPath(job, new Path(outputPath));

  Schema.Parser parser = new Schema.Parser();

  FileSystem fs = FileSystem.get(job.getConfiguration());
  AvroJob.setOutputKeySchema(job, parser.parse(fs.open(new Path(schemaFilePath))));

  if (compressionCodec.equals("snappy")) {
    AvroKeyOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  } else if (compressionCodec.equals("gzip")) {
    AvroKeyOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } else {
    // nothing
  }

  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
}
Example #24
Source File: ExportHBaseTableToDelimiteredSeq.java From HBase-ToHDFS with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
  if (args.length == 0) {
    System.out.println("ExportHBaseTableToDelimiteredSeq {tableName} {ColumnFamily} {outputPath} {compressionCodec} {schemaLocationOnLocal} {delimiter} {rowKeyColumn.optional}");
    return;
  }

  String table = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String compressionCodec = args[3];
  String schemaFilePath = args[4];
  String delimiter = args[5];
  String rowKeyColumn = "";

  if (args.length > 6) {
    rowKeyColumn = args[6];
  }

  Job job = Job.getInstance();
  job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);

  HBaseConfiguration.addHbaseResources(job.getConfiguration());
  job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);
  job.getConfiguration().set(OUTPUT_PATH_CONF, outputPath);
  job.getConfiguration().set(DELIMITER_CONF, delimiter);

  job.setJarByClass(ExportHBaseTableToDelimiteredSeq.class);
  job.setJobName("ExportHBaseTableToDelimiteredSeq ");

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  scan.addFamily(Bytes.toBytes(columnFamily));

  TableMapReduceUtil.initTableMapperJob(table, // input HBase table name
      scan, // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper
      null, // mapper output key
      null, // mapper output value
      job);

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

  if (compressionCodec.equals("snappy")) {
    SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  } else if (compressionCodec.equals("gzip")) {
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } else {
    // nothing
  }

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);

  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
}
Example #25
Source File: ExportHBaseTableToParquet.java From HBase-ToHDFS with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
  if (args.length == 0) {
    System.out.println("ExportHBaseTableToParquet {tableName} {ColumnFamily} {outputPath} {compressionCodec snappy,gzip} {schemaLocationOnHdfs} {rowkey.column.optional}");
    return;
  }

  String table = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String compressionCodec = args[3];
  String schemaFilePath = args[4];
  String rowKeyColumn = "";

  if (args.length > 5) {
    rowKeyColumn = args[5];
  }

  Job job = Job.getInstance();
  job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
  job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);

  HBaseConfiguration.addHbaseResources(job.getConfiguration());

  job.setJarByClass(ExportHBaseTableToParquet.class);
  job.setJobName("ExportHBaseTableToParquet ");

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  scan.addFamily(Bytes.toBytes(columnFamily));

  TableMapReduceUtil.initTableMapperJob(table, // input HBase table name
      scan, // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper
      null, // mapper output key
      null, // mapper output value
      job);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  AvroParquetOutputFormat.setOutputPath(job, new Path(outputPath));

  Schema.Parser parser = new Schema.Parser();

  FileSystem fs = FileSystem.get(job.getConfiguration());
  AvroParquetOutputFormat.setSchema(job, parser.parse(fs.open(new Path(schemaFilePath))));

  if (compressionCodec.equals("snappy")) {
    AvroParquetOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  } else if (compressionCodec.equals("gzip")) {
    AvroParquetOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } else {
    // nothing
  }

  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
}
Example #26
Source File: NativeLibraryChecker.java From hadoop with Apache License 2.0 | 4 votes |
/**
 * A tool to test native library availability.
 */
public static void main(String[] args) {
  String usage = "NativeLibraryChecker [-a|-h]\n"
      + " -a use -a to check all libraries are available\n"
      + " by default just check hadoop library (and\n"
      + " winutils.exe on Windows OS) is available\n"
      + " exit with error code 1 if check failed\n"
      + " -h print this message\n";
  if (args.length > 1 ||
      (args.length == 1 && !(args[0].equals("-a") || args[0].equals("-h")))) {
    System.err.println(usage);
    ExitUtil.terminate(1);
  }
  boolean checkAll = false;
  if (args.length == 1) {
    if (args[0].equals("-h")) {
      System.out.println(usage);
      return;
    }
    checkAll = true;
  }
  Configuration conf = new Configuration();
  boolean nativeHadoopLoaded = NativeCodeLoader.isNativeCodeLoaded();
  boolean zlibLoaded = false;
  boolean snappyLoaded = false;
  // lz4 is linked within libhadoop
  boolean lz4Loaded = nativeHadoopLoaded;
  boolean bzip2Loaded = Bzip2Factory.isNativeBzip2Loaded(conf);
  boolean openSslLoaded = false;
  boolean winutilsExists = false;

  String openSslDetail = "";
  String hadoopLibraryName = "";
  String zlibLibraryName = "";
  String snappyLibraryName = "";
  String lz4LibraryName = "";
  String bzip2LibraryName = "";
  String winutilsPath = null;

  if (nativeHadoopLoaded) {
    hadoopLibraryName = NativeCodeLoader.getLibraryName();
    zlibLoaded = ZlibFactory.isNativeZlibLoaded(conf);
    if (zlibLoaded) {
      zlibLibraryName = ZlibFactory.getLibraryName();
    }
    snappyLoaded = NativeCodeLoader.buildSupportsSnappy() &&
        SnappyCodec.isNativeCodeLoaded();
    if (snappyLoaded && NativeCodeLoader.buildSupportsSnappy()) {
      snappyLibraryName = SnappyCodec.getLibraryName();
    }
    if (OpensslCipher.getLoadingFailureReason() != null) {
      openSslDetail = OpensslCipher.getLoadingFailureReason();
      openSslLoaded = false;
    } else {
      openSslDetail = OpensslCipher.getLibraryName();
      openSslLoaded = true;
    }
    if (lz4Loaded) {
      lz4LibraryName = Lz4Codec.getLibraryName();
    }
    if (bzip2Loaded) {
      bzip2LibraryName = Bzip2Factory.getLibraryName(conf);
    }
  }

  // winutils.exe is required on Windows
  winutilsPath = Shell.getWinUtilsPath();
  if (winutilsPath != null) {
    winutilsExists = true;
  } else {
    winutilsPath = "";
  }

  System.out.println("Native library checking:");
  System.out.printf("hadoop: %b %s%n", nativeHadoopLoaded, hadoopLibraryName);
  System.out.printf("zlib: %b %s%n", zlibLoaded, zlibLibraryName);
  System.out.printf("snappy: %b %s%n", snappyLoaded, snappyLibraryName);
  System.out.printf("lz4: %b %s%n", lz4Loaded, lz4LibraryName);
  System.out.printf("bzip2: %b %s%n", bzip2Loaded, bzip2LibraryName);
  System.out.printf("openssl: %b %s%n", openSslLoaded, openSslDetail);
  if (Shell.WINDOWS) {
    System.out.printf("winutils: %b %s%n", winutilsExists, winutilsPath);
  }

  if ((!nativeHadoopLoaded) || (Shell.WINDOWS && (!winutilsExists)) ||
      (checkAll && !(zlibLoaded && snappyLoaded && lz4Loaded && bzip2Loaded))) {
    // return 1 to indicate the check failed
    ExitUtil.terminate(1);
  }
}
Example #27
Source File: RcFileTester.java From presto with Apache License 2.0 | 4 votes |
@Override
Optional<String> getCodecName() {
  return Optional.of(SnappyCodec.class.getName());
}
Example #28
Source File: AvroMixedMapReduce.java From hiped2 with Apache License 2.0 | 3 votes |
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  JobConf job = new JobConf(conf);
  job.setJarByClass(AvroMixedMapReduce.class);

  job.set(AvroJob.INPUT_SCHEMA, Stock.SCHEMA$.toString());
  job.set(AvroJob.OUTPUT_SCHEMA, StockAvg.SCHEMA$.toString());
  job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

  job.setInputFormat(AvroInputFormat.class);
  job.setOutputFormat(AvroOutputFormat.class);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(DoubleWritable.class);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  return JobClient.runJob(job).isSuccessful() ? 0 : 1;
}
Example #29
Source File: BloomFilterCreator.java From hiped2 with Apache License 2.0 | 3 votes |
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args).addOptions(ReplicatedJoin.UserOptions.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path usersPath = new Path(cli.getArgValueAsString(ReplicatedJoin.UserOptions.USERS));
  Path outputPath = new Path(cli.getArgValueAsString(ReplicatedJoin.UserOptions.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(BloomFilterCreator.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  AvroJob.setOutputKeySchema(job, AvroBytesRecord.SCHEMA);
  job.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, SnappyCodec.class.getName());

  job.setOutputFormatClass(AvroKeyOutputFormat.class);

  job.setMapOutputKeyClass(NullWritable.class);
  job.setMapOutputValueClass(BloomFilter.class);

  FileInputFormat.setInputPaths(job, usersPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.setNumReduceTasks(1);

  return job.waitForCompletion(true) ? 0 : 1;
}