parquet.avro.AvroParquetOutputFormat Java Examples
The following examples show how to use parquet.avro.AvroParquetOutputFormat.
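Before the full examples, here is a minimal sketch of the core setup. This snippet is not taken from any of the projects below; the class and method names are mine, and it assumes an Avro Schema object is already available:

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import parquet.avro.AvroParquetOutputFormat;

public class MinimalParquetAvroSetup {
  public static Job configureJob(Schema writeSchema, String outputPath) throws IOException {
    Job job = Job.getInstance(new Configuration());
    // Write (Void, record) pairs out as Parquet files described by the Avro schema.
    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    AvroParquetOutputFormat.setSchema(job, writeSchema);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job;
  }
}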
Example #1
Source File: ParquetAvroExample.java From parquet-flinktacular with Apache License 2.0
public static void writeAvro(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
  // Set up the Hadoop job
  Job job = Job.getInstance();

  // Set up the Hadoop output format, wrapping AvroParquetOutputFormat for Flink
  HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new AvroParquetOutputFormat(), job);

  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  AvroParquetOutputFormat.setSchema(job, Person.getClassSchema());
  ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
  ParquetOutputFormat.setEnableDictionary(job, true);

  // Output & Execute
  data.output(hadoopOutputFormat);
}
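Note that writeAvro only registers the sink; a caller still has to execute the Flink program. A hypothetical driver sketch, assuming Person is the Avro-generated class used above (its field names here are assumptions):

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;

public class WriteAvroDriver {
  public static void main(String[] args) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Avro-generated classes expose newBuilder(); setName is an assumed field.
    Person person = Person.newBuilder().setName("Alice").build();

    // AvroParquetOutputFormat consumes (Void, record) pairs, so wrap each record.
    DataSet<Tuple2<Void, Person>> people = env.fromElements(person)
        .map(new MapFunction<Person, Tuple2<Void, Person>>() {
          @Override
          public Tuple2<Void, Person> map(Person p) {
            return new Tuple2<Void, Person>(null, p);
          }
        });

    ParquetAvroExample.writeAvro(people, "hdfs:///tmp/person-parquet");
    // execute() actually runs the registered sink.
    env.execute("Write Person records to Parquet");
  }
}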
Example #2
Source File: AvroParquetMapReduce.java From hiped2 with Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(AvroParquetMapReduce.class);

  job.setInputFormatClass(AvroParquetInputFormat.class);
  AvroParquetInputFormat.setInputPaths(job, inputPath);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroParquetOutputFormat.setSchema(job, StockAvg.SCHEMA$);

  return job.waitForCompletion(true) ? 0 : 1;
}
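The Map and Reduce inner classes referenced by this driver are not shown. A hypothetical sketch of what they might look like, assuming Stock and StockAvg are Avro-generated classes with symbol/open and symbol/avg fields respectively:

import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public static class Map extends Mapper<Void, Stock, Text, DoubleWritable> {
  @Override
  public void map(Void key, Stock stock, Context context)
      throws IOException, InterruptedException {
    // AvroParquetInputFormat supplies a null key and the Avro record as the value.
    context.write(new Text(stock.getSymbol().toString()),
        new DoubleWritable(stock.getOpen()));
  }
}

public static class Reduce extends Reducer<Text, DoubleWritable, Void, StockAvg> {
  @Override
  public void reduce(Text symbol, Iterable<DoubleWritable> opens, Context context)
      throws IOException, InterruptedException {
    double sum = 0;
    int count = 0;
    for (DoubleWritable open : opens) {
      sum += open.get();
      count++;
    }
    StockAvg avg = new StockAvg();
    avg.setSymbol(symbol.toString());
    avg.setAvg(sum / count);
    // AvroParquetOutputFormat expects a null key and the Avro record as the value.
    context.write(null, avg);
  }
}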
Example #3
Source File: AvroGenericParquetMapReduce.java From hiped2 with Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(AvroGenericParquetMapReduce.class);

  job.setInputFormatClass(AvroParquetInputFormat.class);
  AvroParquetInputFormat.setInputPaths(job, inputPath);

  // force Avro to supply us GenericRecord objects in the mapper by mutating the
  // schema and changing the class name
  Schema schema = Schema.createRecord("foobar",
      Stock.SCHEMA$.getDoc(), Stock.SCHEMA$.getNamespace(), false);
  List<Schema.Field> fields = Lists.newArrayList();
  for (Schema.Field field : Stock.SCHEMA$.getFields()) {
    fields.add(new Schema.Field(field.name(), field.schema(), field.doc(),
        field.defaultValue(), field.order()));
  }
  schema.setFields(fields);

  AvroParquetInputFormat.setAvroReadSchema(job, schema);

  job.setMapperClass(Map.class);

  job.setOutputKeyClass(Void.class);
  job.setOutputValueClass(GenericRecord.class);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);
  // avroSchema is the output schema, defined elsewhere in the class
  AvroParquetOutputFormat.setSchema(job, avroSchema);

  job.setNumReduceTasks(0);

  return job.waitForCompletion(true) ? 0 : 1;
}
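The avroSchema passed to AvroParquetOutputFormat.setSchema is a field defined elsewhere in the class. A hypothetical definition, with the record and field names assumed, could build the output schema programmatically:

import java.util.Arrays;
import org.apache.avro.Schema;

// Hypothetical output schema; the record and field names are assumptions.
public static final Schema avroSchema;

static {
  avroSchema = Schema.createRecord("StockSummary", null, null, false);
  avroSchema.setFields(Arrays.asList(
      new Schema.Field("symbol", Schema.create(Schema.Type.STRING), null, null),
      new Schema.Field("value", Schema.create(Schema.Type.DOUBLE), null, null)));
}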
Example #4
Source File: AvroProjectionParquetMapReduce.java From hiped2 with Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(AvroProjectionParquetMapReduce.class);

  job.setInputFormatClass(AvroParquetInputFormat.class);
  AvroParquetInputFormat.setInputPaths(job, inputPath);

  // predicate pushdown
  AvroParquetInputFormat.setUnboundRecordFilter(job, GoogleStockFilter.class);

  // projection pushdown
  Schema projection = Schema.createRecord(Stock.SCHEMA$.getName(),
      Stock.SCHEMA$.getDoc(), Stock.SCHEMA$.getNamespace(), false);
  List<Schema.Field> fields = Lists.newArrayList();
  for (Schema.Field field : Stock.SCHEMA$.getFields()) {
    if ("symbol".equals(field.name()) || "open".equals(field.name())) {
      fields.add(new Schema.Field(field.name(), field.schema(), field.doc(),
          field.defaultValue(), field.order()));
    }
  }
  projection.setFields(fields);
  AvroParquetInputFormat.setRequestedProjection(job, projection);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroParquetOutputFormat.setSchema(job, StockAvg.SCHEMA$);

  return job.waitForCompletion(true) ? 0 : 1;
}
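The GoogleStockFilter class referenced for predicate pushdown is not shown. A hypothetical implementation using the old (pre-filter2) parquet.filter API might look like this:

import parquet.column.ColumnReader;
import parquet.filter.ColumnPredicates;
import parquet.filter.ColumnRecordFilter;
import parquet.filter.RecordFilter;
import parquet.filter.UnboundRecordFilter;

public class GoogleStockFilter implements UnboundRecordFilter {
  private final UnboundRecordFilter filter =
      ColumnRecordFilter.column("symbol", ColumnPredicates.equalTo("GOOG"));

  @Override
  public RecordFilter bind(Iterable<ColumnReader> readers) {
    // Keep only records whose symbol column equals "GOOG".
    return filter.bind(readers);
  }
}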
Example #5
Source File: ExportHBaseTableToParquet.java From HBase-ToHDFS with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
  if (args.length == 0) {
    System.out.println("ExportHBaseTableToParquet {tableName} {ColumnFamily} {outputPath} "
        + "{compressionCodec snappy,gzip} {schemaLocationOnHdfs} {rowkey.column.optional}");
    return;
  }

  String table = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String compressionCodec = args[3];
  String schemaFilePath = args[4];
  String rowKeyColumn = "";
  if (args.length > 5) {
    rowKeyColumn = args[5];
  }

  Job job = Job.getInstance();
  job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
  job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);
  HBaseConfiguration.addHbaseResources(job.getConfiguration());

  job.setJarByClass(ExportHBaseTableToParquet.class);
  job.setJobName("ExportHBaseTableToParquet");

  Scan scan = new Scan();
  scan.setCaching(500);       // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  scan.addFamily(Bytes.toBytes(columnFamily));

  TableMapReduceUtil.initTableMapperJob(
      table,          // input HBase table name
      scan,           // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper
      null,           // mapper output key
      null,           // mapper output value
      job);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  AvroParquetOutputFormat.setOutputPath(job, new Path(outputPath));

  Schema.Parser parser = new Schema.Parser();
  FileSystem fs = FileSystem.get(job.getConfiguration());
  AvroParquetOutputFormat.setSchema(job, parser.parse(fs.open(new Path(schemaFilePath))));

  if (compressionCodec.equals("snappy")) {
    AvroParquetOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  } else if (compressionCodec.equals("gzip")) {
    AvroParquetOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } // else: no compression

  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
}
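The MyMapper class referenced by this driver is not shown. A hypothetical sketch, assuming each Avro field name matches an HBase column qualifier and all fields are strings:

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;

public static class MyMapper extends TableMapper<Void, GenericRecord> {
  private Schema schema;

  @Override
  protected void setup(Context context) throws IOException {
    // Reload the Avro schema whose HDFS location the driver stored in the configuration.
    String schemaPath = context.getConfiguration().get(SCHEMA_FILE_LOCATION_CONF);
    FileSystem fs = FileSystem.get(context.getConfiguration());
    schema = new Schema.Parser().parse(fs.open(new Path(schemaPath)));
  }

  @Override
  protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
      throws IOException, InterruptedException {
    GenericRecord record = new GenericData.Record(schema);
    // Copy each returned cell into the field with the matching qualifier name.
    for (KeyValue kv : result.raw()) {
      record.put(Bytes.toString(kv.getQualifier()), Bytes.toString(kv.getValue()));
    }
    context.write(null, record);
  }
}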