org.apache.avro.mapreduce.AvroJob Java Examples
The following examples show how to use
org.apache.avro.mapreduce.AvroJob.
Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
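AvroJob is the helper for attaching Avro schemas to a Hadoop MapReduce Job: setInputKeySchema/setOutputKeySchema configure the schemas used to read input and write output, while setMapOutputKeySchema/setMapOutputValueSchema configure the intermediate map output. Before the real-world examples below, here is a minimal orientation sketch of a map-only driver; the inline schema literal and path arguments are placeholders, not taken from any of the projects cited here.

import org.apache.avro.Schema;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class AvroJobSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance();
    // Placeholder schema for illustration only.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":"
            + "[{\"name\":\"id\",\"type\":\"long\"}]}");

    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, schema);   // schema the mapper reads with

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    AvroJob.setOutputKeySchema(job, schema);  // schema the job writes with

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(0); // map-only for this sketch
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}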
Example #1
Source File: CombinedAvroKeyInputFormat.java, from datafu (Apache License 2.0)
@SuppressWarnings("unchecked")
@Override
public RecordReader<AvroKey<T>, NullWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext context)
    throws IOException {
  Schema readerSchema = AvroJob.getInputKeySchema(context.getConfiguration());
  if (null == readerSchema) {
    LOG.warn("Reader schema was not set. Use AvroJob.setInputKeySchema() if desired.");
    LOG.info("Using a reader schema equal to the writer schema.");
  }

  // Cast through Object to satisfy the generic bounds expected by CombineFileRecordReader.
  Object c = CombinedAvroKeyRecordReader.class;
  return new CombineFileRecordReader<AvroKey<T>, NullWritable>((CombineFileSplit) inputSplit, context,
      (Class<? extends RecordReader<AvroKey<T>, NullWritable>>) c);
}
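This input format packs many small Avro files into combined splits. A driver would wire it in roughly as in the following sketch; the split-size value and input path are illustrative assumptions, not values taken from datafu.

// Hypothetical driver wiring for CombinedAvroKeyInputFormat (not from the datafu source).
Job job = Job.getInstance(getConf());
job.setInputFormatClass(CombinedAvroKeyInputFormat.class);
AvroJob.setInputKeySchema(job, readerSchema); // optional; the reader falls back to the writer schema
// Cap the combined split size so a single split does not swallow the whole input.
// This sets mapreduce.input.fileinputformat.split.maxsize, which CombineFileInputFormat honors.
FileInputFormat.setMaxInputSplitSize(job, 512L * 1024 * 1024);
FileInputFormat.addInputPath(job, new Path("/data/events")); // placeholder path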
Example #2
Source File: TestMapReduceHBase.java, from kite (Apache License 2.0)
@Test
@SuppressWarnings("deprecation")
public void testJobEmptyView() throws Exception {
  Job job = new Job(HBaseTestUtils.getConf());

  String datasetName = tableName + ".TestGenericEntity";

  Dataset<GenericRecord> inputDataset = repo.create("default", "in",
      new DatasetDescriptor.Builder().schemaLiteral(testGenericEntity).build());

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  DatasetKeyInputFormat.configure(job).readFrom(inputDataset);

  job.setMapperClass(AvroKeyWrapperMapper.class);
  job.setMapOutputKeyClass(AvroKey.class);
  job.setMapOutputValueClass(NullWritable.class);
  AvroJob.setMapOutputKeySchema(job, new Schema.Parser().parse(testGenericEntity));

  job.setReducerClass(AvroKeyWrapperReducer.class);
  job.setOutputKeyClass(GenericData.Record.class);
  job.setOutputValueClass(Void.class);
  AvroJob.setOutputKeySchema(job, new Schema.Parser().parse(testGenericEntity));

  DatasetKeyOutputFormat.configure(job).writeTo(outputDataset);

  Assert.assertTrue(job.waitForCompletion(true));
}
Example #3
Source File: CQLMapper.java, from aegisthus (Apache License 2.0)
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  avroSchema = AvroJob.getOutputKeySchema(context.getConfiguration());
  cfMetaData = CFMetadataUtility.initializeCfMetaData(context.getConfiguration());
  cfDef = cfMetaData.getCfDef();
  initBuilder();

  /* This exporter assumes tables are composite, which should be true of all current schemas */
  if (!cfDef.isComposite) {
    throw new RuntimeException("Only can export composite CQL table schemas.");
  }
}
Example #4
Source File: SegmentPreprocessingMapper.java, from incubator-pinot (Apache License 2.0)
@Override
public void setup(final Context context) {
  Configuration configuration = context.getConfiguration();

  String tableName = configuration.get(JobConfigConstants.SEGMENT_TABLE_NAME);
  _isAppend = configuration.get(InternalConfigConstants.IS_APPEND).equalsIgnoreCase("true");

  if (_isAppend) {
    // Get time column name
    _timeColumn = configuration.get(InternalConfigConstants.TIME_COLUMN_CONFIG);

    // Get sample time column value
    String timeColumnValue = configuration.get(InternalConfigConstants.TIME_COLUMN_VALUE);
    String pushFrequency = configuration.get(InternalConfigConstants.SEGMENT_PUSH_FREQUENCY);

    String timeType = configuration.get(InternalConfigConstants.SEGMENT_TIME_TYPE);
    String timeFormat = configuration.get(InternalConfigConstants.SEGMENT_TIME_FORMAT);
    DateTimeFormatSpec dateTimeFormatSpec;
    if (timeFormat.equals(DateTimeFieldSpec.TimeFormat.EPOCH.toString())) {
      dateTimeFormatSpec = new DateTimeFormatSpec(1, timeType, timeFormat);
    } else {
      dateTimeFormatSpec = new DateTimeFormatSpec(1, timeType, timeFormat,
          configuration.get(InternalConfigConstants.SEGMENT_TIME_SDF_PATTERN));
    }

    _normalizedDateSegmentNameGenerator =
        new NormalizedDateSegmentNameGenerator(tableName, null, false, "APPEND", pushFrequency, dateTimeFormatSpec);
    _sampleNormalizedTimeColumnValue = _normalizedDateSegmentNameGenerator.getNormalizedDate(timeColumnValue);
  }

  String sortedColumn = configuration.get(InternalConfigConstants.SORTED_COLUMN_CONFIG);
  // Logging the configs for the mapper
  LOGGER.info("Sorted Column: " + sortedColumn);
  if (sortedColumn != null) {
    _sortedColumn = sortedColumn;
  }

  _outputKeySchema = AvroJob.getMapOutputKeySchema(configuration);
  _outputSchema = AvroJob.getMapOutputValueSchema(configuration);
  _enablePartitioning =
      Boolean.parseBoolean(configuration.get(InternalConfigConstants.ENABLE_PARTITIONING, "false"));
}
Example #5
Source File: AvroKeyWithMetadataOutputFormat.java, from datafu (Apache License 2.0)
/** {@inheritDoc} */
@Override
@SuppressWarnings("unchecked")
public RecordWriter<AvroKey<T>, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException {
  // Get the writer schema.
  Schema writerSchema = AvroJob.getOutputKeySchema(context.getConfiguration());
  if (null == writerSchema) {
    throw new IOException(
        "AvroKeyOutputFormat requires an output schema. Use AvroJob.setOutputKeySchema().");
  }

  return mRecordWriterFactory.create(
      writerSchema, getCompressionCodec(context), getAvroFileOutputStream(context), context.getConfiguration());
}
Example #6
Source File: AvroHdfsFileSink.java, from components (Apache License 2.0)
@Override
protected void configure(Job job, KV<AvroKey<IndexedRecord>, NullWritable> sample) {
  super.configure(job, sample);
  AvroKey<IndexedRecord> k = sample.getKey();
  AvroJob.setOutputKeySchema(job, k.datum().getSchema());
  FileOutputFormat.setCompressOutput(job, true);
  job.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, DataFileConstants.SNAPPY_CODEC);
}
Example #7
Source File: CompactionAvroJobConfigurator.java, from incubator-gobblin (Apache License 2.0)
@Override
protected void configureSchema(Job job) throws IOException {
  Schema newestSchema = MRCompactorAvroKeyDedupJobRunner.getNewestSchemaFromSource(job, this.fs);
  if (newestSchema != null) {
    if (this.state.getPropAsBoolean(MRCompactorAvroKeyDedupJobRunner.COMPACTION_JOB_AVRO_SINGLE_INPUT_SCHEMA, true)) {
      AvroJob.setInputKeySchema(job, newestSchema);
    }
    AvroJob.setMapOutputKeySchema(job, this.shouldDeduplicate ? getDedupKeySchema(newestSchema) : newestSchema);
    AvroJob.setMapOutputValueSchema(job, newestSchema);
    AvroJob.setOutputKeySchema(job, newestSchema);
  }
}
Example #8
Source File: AvroKeyCombineFileRecordReader.java, from incubator-gobblin (Apache License 2.0)
private static Schema getSchema(CombineFileSplit split, TaskAttemptContext cx, Integer idx) throws IOException {
  Schema schema = AvroJob.getInputKeySchema(cx.getConfiguration());
  if (schema != null) {
    return schema;
  }

  // No reader schema configured: fall back to the writer schema embedded in the data file.
  Path path = split.getPath(idx);
  FileSystem fs = path.getFileSystem(cx.getConfiguration());
  return AvroUtils.getSchemaFromDataFile(path, fs);
}
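AvroUtils.getSchemaFromDataFile is a Gobblin helper. With only the core Avro library, the same fallback, reading the writer schema out of the container file's header, could be sketched as below; the method name readWriterSchema is illustrative, not part of any library.

// Sketch: read the writer schema from an Avro data file using plain Avro APIs.
private static Schema readWriterSchema(Path path, Configuration conf) throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  try (InputStream in = fs.open(path);
       DataFileStream<GenericRecord> reader =
           new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
    return reader.getSchema(); // the schema is stored in the container file's metadata
  }
}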
Example #9
Source File: MRCompactorAvroKeyDedupJobRunner.java, from incubator-gobblin (Apache License 2.0)
private void configureSchema(Job job) throws IOException {
  Schema newestSchema = getNewestSchemaFromSource(job, this.fs);
  if (this.useSingleInputSchema) {
    AvroJob.setInputKeySchema(job, newestSchema);
  }
  AvroJob.setMapOutputKeySchema(job, this.shouldDeduplicate ? getKeySchema(job, newestSchema) : newestSchema);
  AvroJob.setMapOutputValueSchema(job, newestSchema);
  AvroJob.setOutputKeySchema(job, newestSchema);
}
Example #10
Source File: AvroKeyMapper.java, from incubator-gobblin (Apache License 2.0)
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  this.keySchema = AvroJob.getMapOutputKeySchema(context.getConfiguration());
  this.outKey = new AvroKey<>();
  this.outKey.datum(new GenericData.Record(this.keySchema));
  this.outValue = new AvroValue<>();
}
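The pre-allocated outKey/outValue pattern avoids creating new wrapper objects per input record. A map() built on this setup might look like the following sketch; the field-projection logic is a hypothetical illustration of the pattern, not Gobblin's actual implementation.

// Hypothetical map() using the pre-allocated key/value from setup() above.
@Override
protected void map(AvroKey<GenericRecord> key, NullWritable value, Context context)
    throws IOException, InterruptedException {
  GenericRecord record = key.datum();
  GenericRecord keyRecord = (GenericRecord) this.outKey.datum();
  // Project only the fields named in the map-output key schema.
  for (Schema.Field field : this.keySchema.getFields()) {
    keyRecord.put(field.name(), record.get(field.name()));
  }
  this.outValue.datum(record);
  context.write(this.outKey, this.outValue);
}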
Example #11
Source File: ExportHBaseTableToAvro.java, from HBase-ToHDFS (Apache License 2.0)
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
  if (args.length == 0) {
    System.out.println("ExportHBaseTableToAvro {tableName} {ColumnFamily} {outputPath} "
        + "{compressionCodec snappy,gzip} {schemaLocationOnHdfs} {rowKeyColumn.Optional}");
    return;
  }

  String table = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String compressionCodec = args[3];
  String schemaFilePath = args[4];
  String rowKeyColumn = "";
  if (args.length > 5) {
    rowKeyColumn = args[5];
  }

  Job job = Job.getInstance();
  HBaseConfiguration.addHbaseResources(job.getConfiguration());

  job.setJarByClass(ExportHBaseTableToAvro.class);
  job.setJobName("ExportHBaseTableToAvro ");

  job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
  job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  scan.addFamily(Bytes.toBytes(columnFamily));

  TableMapReduceUtil.initTableMapperJob(
      table,          // input HBase table name
      scan,           // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper
      null,           // mapper output key
      null,           // mapper output value
      job);
  job.setOutputFormatClass(AvroKeyOutputFormat.class);
  AvroKeyOutputFormat.setOutputPath(job, new Path(outputPath));

  Schema.Parser parser = new Schema.Parser();
  FileSystem fs = FileSystem.get(job.getConfiguration());
  AvroJob.setOutputKeySchema(job, parser.parse(fs.open(new Path(schemaFilePath))));

  if (compressionCodec.equals("snappy")) {
    AvroKeyOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  } else if (compressionCodec.equals("gzip")) {
    AvroKeyOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } else {
    // nothing
  }

  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
}
Example #12
Source File: CombinedAvroKeyInputFormat.java, from datafu (Apache License 2.0)
public CombinedAvroKeyRecordReader(CombineFileSplit inputSplit, TaskAttemptContext context, Integer idx) {
  super(AvroJob.getInputKeySchema(context.getConfiguration()));
  this.inputSplit = inputSplit;
  this.idx = idx;
}
Example #13
Source File: AggregationPhaseJob.java, from incubator-pinot (Apache License 2.0)
public Job run() throws Exception {
  Job job = Job.getInstance(getConf());
  job.setJobName(name);
  job.setJarByClass(AggregationPhaseJob.class);

  FileSystem fs = FileSystem.get(getConf());
  Configuration configuration = job.getConfiguration();

  // Properties
  LOGGER.info("Properties {}", props);

  // Input Path
  String inputPathDir = getAndSetConfiguration(configuration, AGG_PHASE_INPUT_PATH);
  LOGGER.info("Input path dir: " + inputPathDir);
  for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) {
    LOGGER.info("Adding input:" + inputPath);
    Path input = new Path(inputPath);
    FileInputFormat.addInputPath(job, input);
  }

  // Output path
  Path outputPath = new Path(getAndSetConfiguration(configuration, AGG_PHASE_OUTPUT_PATH));
  LOGGER.info("Output path dir: " + outputPath.toString());
  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(job, outputPath);

  // Schema
  Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
  LOGGER.info("Schema : {}", avroSchema.toString(true));
  job.getConfiguration().set(AGG_PHASE_AVRO_SCHEMA.toString(), avroSchema.toString());

  // ThirdEyeConfig
  String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty(
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty);
  String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
  ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
  job.getConfiguration().set(AGG_PHASE_THIRDEYE_CONFIG.toString(),
      OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

  // Map config
  job.setMapperClass(AggregationMapper.class);
  job.setInputFormatClass(AvroKeyInputFormat.class);
  job.setMapOutputKeyClass(BytesWritable.class);
  job.setMapOutputValueClass(BytesWritable.class);

  // Reduce config
  job.setReducerClass(AggregationReducer.class);
  job.setOutputKeyClass(AvroKey.class);
  job.setOutputValueClass(NullWritable.class);
  AvroJob.setOutputKeySchema(job, avroSchema);
  job.setOutputFormatClass(AvroKeyOutputFormat.class);

  String numReducers = props.getProperty(ThirdEyeJobProperties.THIRDEYE_NUM_REDUCERS.getName());
  LOGGER.info("Num Reducers : {}", numReducers);
  if (StringUtils.isNotBlank(numReducers)) {
    job.setNumReduceTasks(Integer.valueOf(numReducers));
    LOGGER.info("Setting num reducers {}", job.getNumReduceTasks());
  }

  job.waitForCompletion(true);

  Counter counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS);
  LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
  if (counter.getValue() == 0) {
    throw new IllegalStateException("No input records in " + inputPathDir);
  }
  counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS_FLATTENED);
  LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());

  for (String metric : thirdeyeConfig.getMetricNames()) {
    counter = job.getCounters().findCounter(thirdeyeConfig.getCollection(), metric);
    LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
  }

  return job;
}
Example #14
Source File: DerivedColumnTransformationPhaseJob.java, from incubator-pinot (Apache License 2.0)
public Job run() throws Exception {
  Job job = Job.getInstance(getConf());
  job.setJobName(name);
  job.setJarByClass(DerivedColumnTransformationPhaseJob.class);

  Configuration configuration = job.getConfiguration();
  FileSystem fs = FileSystem.get(configuration);

  // Input Path
  String inputPathDir = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH);
  LOGGER.info("Input path dir: " + inputPathDir);
  for (String inputPath : inputPathDir.split(",")) {
    LOGGER.info("Adding input:" + inputPath);
    Path input = new Path(inputPath);
    FileInputFormat.addInputPath(job, input);
  }

  // Topk path
  String topkPath = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH);
  LOGGER.info("Topk path : " + topkPath);

  // Output path
  Path outputPath = new Path(getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH));
  LOGGER.info("Output path dir: " + outputPath.toString());
  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(job, outputPath);

  // Schema
  Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
  LOGGER.info("Schema : {}", avroSchema.toString(true));

  // ThirdEyeConfig
  String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty(
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty);
  String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
  ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(),
      OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
  LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode());

  // New schema
  Schema outputSchema = newSchema(thirdeyeConfig);
  job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(), outputSchema.toString());

  // Map config
  job.setMapperClass(DerivedColumnTransformationPhaseMapper.class);
  job.setInputFormatClass(AvroKeyInputFormat.class);
  job.setMapOutputKeyClass(AvroKey.class);
  job.setMapOutputValueClass(NullWritable.class);

  AvroJob.setOutputKeySchema(job, outputSchema);
  LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class);
  AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, outputSchema);

  job.setNumReduceTasks(0);

  job.waitForCompletion(true);

  return job;
}
Example #15
Source File: OSMRunner.java, from geowave (Apache License 2.0)
@Override
public int run(final String[] args) throws Exception {
  final Configuration conf = getConf();
  conf.set("tableName", ingestOptions.getQualifiedTableName());
  conf.set("osmVisibility", ingestOptions.getVisibilityOptions().getVisibility());

  // job settings
  final Job job = Job.getInstance(conf, ingestOptions.getJobName());
  job.setJarByClass(OSMRunner.class);
  switch (ingestOptions.getMapperType()) {
    case "NODE": {
      configureSchema(AvroNode.getClassSchema());
      inputAvroFile = ingestOptions.getNodesBasePath();
      job.setMapperClass(OSMNodeMapper.class);
      break;
    }
    case "WAY": {
      configureSchema(AvroWay.getClassSchema());
      inputAvroFile = ingestOptions.getWaysBasePath();
      job.setMapperClass(OSMWayMapper.class);
      break;
    }
    case "RELATION": {
      configureSchema(AvroRelation.getClassSchema());
      inputAvroFile = ingestOptions.getRelationsBasePath();
      job.setMapperClass(OSMRelationMapper.class);
      break;
    }
    default:
      break;
  }
  if ((avroSchema == null) || (inputAvroFile == null)) {
    throw new MissingArgumentException(
        "argument for mapper type must be one of: NODE, WAY, or RELATION");
  }

  enableLocalityGroups(ingestOptions);

  // input format
  job.setInputFormatClass(AvroKeyInputFormat.class);
  FileInputFormat.setInputPaths(job, inputAvroFile);
  AvroJob.setInputKeySchema(job, avroSchema);

  // mapper
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Mutation.class);
  job.setOutputFormatClass(AccumuloOutputFormat.class);
  AccumuloOutputFormat.setConnectorInfo(
      job, accumuloOptions.getUser(), new PasswordToken(accumuloOptions.getPassword()));
  AccumuloOutputFormat.setCreateTables(job, true);
  AccumuloOutputFormat.setDefaultTableName(job, ingestOptions.getQualifiedTableName());
  AccumuloOutputFormat.setZooKeeperInstance(
      job, new ClientConfiguration().withInstance(accumuloOptions.getInstance()).withZkHosts(
          accumuloOptions.getZookeeper()));

  // reducer
  job.setNumReduceTasks(0);

  return job.waitForCompletion(true) ? 0 : -1;
}
Example #16
Source File: SSTableExport.java, from aegisthus (Apache License 2.0)
@Override
public int run(String[] args) throws Exception {
  Job job = Job.getInstance(getConf());
  job.setJarByClass(SSTableExport.class);
  CommandLine cl = getOptions(args);
  if (cl == null) {
    return 1;
  }

  // Check all of the paths and load the sstable version from the input filenames
  List<Path> paths = Lists.newArrayList();
  if (cl.hasOption(Feature.CMD_ARG_INPUT_FILE)) {
    for (String input : cl.getOptionValues(Feature.CMD_ARG_INPUT_FILE)) {
      checkVersionFromFilename(input);
      paths.add(new Path(input));
    }
  }
  if (cl.hasOption(Feature.CMD_ARG_INPUT_DIR)) {
    paths.addAll(getDataFiles(job.getConfiguration(), cl.getOptionValue(Feature.CMD_ARG_INPUT_DIR)));
  }

  String avroSchemaString = getAvroSchema(cl.getOptionValue(Feature.CMD_ARG_AVRO_SCHEMA_FILE), job.getConfiguration());
  Schema avroSchema = new Schema.Parser().parse(avroSchemaString);

  // At this point we have the version of sstable that we can use for this run
  job.getConfiguration().set(Aegisthus.Feature.CONF_SSTABLE_VERSION, version.toString());

  if (job.getConfiguration().get(Aegisthus.Feature.CONF_CQL_SCHEMA) != null) {
    setConfigurationFromCql(job.getConfiguration());
  }

  job.setInputFormatClass(AegisthusInputFormat.class);
  job.setMapperClass(CQLMapper.class);
  job.setOutputFormatClass(AvroKeyOutputFormat.class);
  AvroJob.setOutputKeySchema(job, avroSchema);

  // Map-only job
  job.setNumReduceTasks(0);

  TextInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
  FileOutputFormat.setOutputPath(job, new Path(cl.getOptionValue(Feature.CMD_ARG_OUTPUT_DIR)));

  job.submit();
  System.out.println(job.getJobID());
  System.out.println(job.getTrackingURL());
  boolean success = job.waitForCompletion(true);
  return success ? 0 : 1;
}
Example #17
Source File: BloomFilterCreator.java, from hiped2 (Apache License 2.0)
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args).addOptions(ReplicatedJoin.UserOptions.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path usersPath = new Path(cli.getArgValueAsString(ReplicatedJoin.UserOptions.USERS));
  Path outputPath = new Path(cli.getArgValueAsString(ReplicatedJoin.UserOptions.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(BloomFilterCreator.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  AvroJob.setOutputKeySchema(job, AvroBytesRecord.SCHEMA);
  job.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, SnappyCodec.class.getName());

  job.setOutputFormatClass(AvroKeyOutputFormat.class);
  job.setMapOutputKeyClass(NullWritable.class);
  job.setMapOutputValueClass(BloomFilter.class);

  FileInputFormat.setInputPaths(job, usersPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.setNumReduceTasks(1);

  return job.waitForCompletion(true) ? 0 : 1;
}