org.apache.hadoop.mapreduce.lib.output.MultipleOutputs Java Examples

The following examples show how to use org.apache.hadoop.mapreduce.lib.output.MultipleOutputs. Each example is taken from an open-source project; the source file, project, and license are listed above the snippet.
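Before diving into the examples, a quick orientation may help: with the mapreduce API, MultipleOutputs is registered in the driver via MultipleOutputs.addNamedOutput(...), instantiated once in the task's setup(), written to with write(...), and closed in cleanup(). The sketch below is illustrative only; the class name StatsReducer and the named output "stats" are made up for this page, not taken from any of the projects below.

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class StatsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private MultipleOutputs<Text, IntWritable> mos;

    @Override
    protected void setup(Context context) {
        // Instantiate once per task, in setup(), as the Javadoc in Example #17 recommends
        mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // Route the record to the named output "stats"; the driver must have registered it with
        // MultipleOutputs.addNamedOutput(job, "stats", TextOutputFormat.class, Text.class, IntWritable.class)
        mos.write("stats", key, new IntWritable(sum));
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Closing flushes and closes the record writers created for the named outputs
        mos.close();
    }
}
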
Example #1
Source File: ConvergeCuboidDataReducer.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    mos = new MultipleOutputs(context);

    String cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME);
    String segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();

    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeSegment cubeSegment = cube.getSegmentById(segmentID);
    CubeSegment oldSegment = cube.getOriginalSegmentToOptimize(cubeSegment);

    this.enableSharding = oldSegment.isEnableSharding();
    this.baseCuboid = cube.getCuboidScheduler().getBaseCuboidId();
}
 
Example #2
Source File: ConvergeCuboidDataReducer.java    From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    mos = new MultipleOutputs(context);

    String cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME);
    String segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();

    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeSegment cubeSegment = cube.getSegmentById(segmentID);
    CubeSegment oldSegment = cube.getOriginalSegmentToOptimize(cubeSegment);

    this.enableSharding = oldSegment.isEnableSharding();
    this.baseCuboid = cube.getCuboidScheduler().getBaseCuboidId();
}
 
Example #3
Source File: WARCWriterReducerClass.java    From dkpro-c4corpus with Apache License 2.0
/**
 * Writes a single WARCWritable to the output with a specific output file prefix
 *
 * @param warcWritable    warc record
 * @param multipleOutputs output
 * @throws IOException          exception
 * @throws InterruptedException exception
 */
// TODO move somewhere else?
public static void writeSingleWARCWritableToOutput(WARCWritable warcWritable,
        MultipleOutputs<NullWritable, WARCWritable> multipleOutputs)
        throws IOException, InterruptedException
{
    WARCRecord.Header header = warcWritable.getRecord().getHeader();
    String license = header.getField(WARCRecord.WARCRecordFieldConstants.LICENSE);
    String language = header.getField(WARCRecord.WARCRecordFieldConstants.LANGUAGE);
    String noBoilerplate = header
            .getField(WARCRecord.WARCRecordFieldConstants.NO_BOILERPLATE);
    String minimalHtml = header.getField(WARCRecord.WARCRecordFieldConstants.MINIMAL_HTML);

    // set the file name prefix
    String fileName = createOutputFilePrefix(license, language, noBoilerplate, minimalHtml);

    // bottleneck of single reducer for all "Lic_none_Lang_en" pages (majority of Web)
    //        if ("en".equals(language) && LicenseDetector.NO_LICENCE.equals(license)) {
    //            long simHash = Long
    //                    .valueOf(header.getField(WARCRecord.WARCRecordFieldConstants.SIMHASH));
    //            int binNumber = getBinNumberFromSimHash(simHash);
    //            fileName = createOutputFilePrefix(license, language, noBoilerplate);
    //        }

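    // Note: this is the write(key, value, baseOutputPath) overload, which sends the record through
    // the job's default output format but uses fileName as the base output path, so records that
    // share a prefix end up in the same set of output files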
    multipleOutputs.write(NullWritable.get(), warcWritable, fileName);
}
 
Example #4
Source File: AbstractReasoningTool.java    From rya with Apache License 2.0
/**
 * Set up a MapReduce job to output human-readable text.
 */
protected void configureTextOutput(String destination) {
    Path outPath;
    outPath = MRReasoningUtils.getOutputPath(job.getConfiguration(), destination);
    TextOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}
 
Example #5
Source File: HadoopMultipleOutputFormat.java    From kylin with Apache License 2.0
@Override
public void open(int taskNumber, int numTasks) throws IOException {
    super.open(taskNumber, numTasks);

    synchronized (OPEN_MULTIPLE_MUTEX) {
        try {
            TaskInputOutputContext taskInputOutputContext = new ReduceContextImpl(configuration,
                    context.getTaskAttemptID(), new InputIterator(), new GenericCounter(), new GenericCounter(),
                    recordWriter, outputCommitter, new DummyReporter(), null,
                    BytesWritable.class, BytesWritable.class);
            this.writer = new MultipleOutputs(taskInputOutputContext);
        } catch (InterruptedException e) {
            throw new IOException("Could not create MultipleOutputs.", e);
        }
    }
}
 
Example #6
Source File: FinalResponseReducer.java    From incubator-retired-pirk with Apache License 2.0
@Override
public void setup(Context ctx) throws IOException, InterruptedException
{
  super.setup(ctx);

  mos = new MultipleOutputs<>(ctx);

  FileSystem fs = FileSystem.newInstance(ctx.getConfiguration());
  storage = new HadoopFileSystemStore(fs);
  String queryDir = ctx.getConfiguration().get("pirMR.queryInputDir");
  Query query = storage.recall(queryDir, Query.class);
  QueryInfo queryInfo = query.getQueryInfo();

  outputFile = ctx.getConfiguration().get("pirMR.outputFile");

  response = new Response(queryInfo);
}
 
Example #7
Source File: FeatureDataMapper.java    From data-polygamy with BSD 3-Clause "New" or "Revised" License
@Override
public void setup(Context context)
        throws IOException, InterruptedException {
    
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String[] fileSplitTokens = fileSplit.getPath().getParent().toString().split("/");
    dataset = fileSplitTokens[fileSplitTokens.length-1];
    
    out = new MultipleOutputs<Text,Text>(context);
}
 
Example #8
Source File: FactDistinctColumnsJob.java    From kylin with Apache License 2.0
private void setupReducer(Path output, CubeSegment cubeSeg)
        throws IOException {
    FactDistinctColumnsReducerMapping reducerMapping = new FactDistinctColumnsReducerMapping(cubeSeg.getCubeInstance());
    int numberOfReducers = reducerMapping.getTotalReducerNum();
    logger.info("{} has reducers {}.", this.getClass().getName(), numberOfReducers);
    if (numberOfReducers > 250) {
        throw new IllegalArgumentException(
                "The max reducer number for FactDistinctColumnsJob is 250, but now it is "
                        + numberOfReducers
                        + ", decrease 'kylin.engine.mr.uhc-reducer-count'");
    }

    job.setReducerClass(FactDistinctColumnsReducer.class);
    job.setPartitionerClass(FactDistinctColumnPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    // make each reducer output to its respective dir
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_COLUMN, SequenceFileOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class, NullWritable.class, ArrayPrimitiveWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_STATISTICS, SequenceFileOutputFormat.class, LongWritable.class, BytesWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_PARTITION, TextOutputFormat.class, NullWritable.class, LongWritable.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
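The named outputs registered above are what let each reducer in this job write its results into a separate directory. On the reducer side this is typically done with the four-argument write overload, whose last argument is a base output path relative to the job output directory. A hedged illustration (the value and the path "column_name/part" are placeholders, not Kylin's actual layout):

    // inside the reducer, after mos = new MultipleOutputs(context) in setup():
    mos.write(BatchConstants.CFG_OUTPUT_COLUMN, NullWritable.get(), new Text("some distinct value"), "column_name/part");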
 
Example #9
Source File: BuildGlobalHiveDictPartBuildJob.java    From kylin with Apache License 2.0
private void setOutput(Job job, String[] dicColsArr, String outputBase) {
    // make each reducer output to its respective dir
    // eg: /user/kylin/tmp/kylin/globaldic_test/kylin-188c9f9d_dabb_944e_9f20_99dc95be66e6/kylin_sales_cube_mr/dict_column=KYLIN_SALES_SELLER_ID/part_sort
    for (int i = 0; i < dicColsArr.length; i++) {
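        // Named output names may only contain letters and digits, so the column index itself
        // can serve as the name of each column's output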
        MultipleOutputs.addNamedOutput(job, i + "", TextOutputFormat.class, LongWritable.class, Text.class);
    }
    Path outputPath = new Path(outputBase);
    FileOutputFormat.setOutputPath(job, outputPath);
}
 
Example #10
Source File: BuildGlobalHiveDictTotalBuildJob.java    From kylin with Apache License 2.0
private void setOutput(Job job, String[] dicColsArr, String outputBase) {
    // make each reducer output to its respective dir
    // eg: /user/prod_kylin/tmp/kylin2/globaldic_test/kylin-188c9f9d_dabb_944e_9f20_99dc95be66e6/bs_order_scene_day_new_cube_clone/dict_column=DM_ES_REPORT_ORDER_VIEW0420_DRIVER_ID/part_sort
    for (int i = 0; i < dicColsArr.length; i++) {
        MultipleOutputs.addNamedOutput(job, i + "", TextOutputFormat.class, Text.class, LongWritable.class);
    }
    Path outputPath = new Path(outputBase);
    FileOutputFormat.setOutputPath(job, outputPath);
}
 
Example #11
Source File: BuildGlobalHiveDictPartBuildReducer.java    From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    mos = new MultipleOutputs(context);
    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    dicCols = config.getMrHiveDictColumnsExcludeRefColumns();
}
 
Example #12
Source File: UHCDictionaryReducer.java    From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeDesc cubeDesc = cube.getDescriptor();
    List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns();

    int taskId = context.getTaskAttemptID().getTaskID().getId();
    col = uhcColumns.get(taskId);
    logger.info("column name: " + col.getIdentity());

    if (cube.getDescriptor().getShardByColumns().contains(col)) {
        //for ShardByColumns
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
    } else {
        //for GlobalDictionaryColumns
        String hdfsDir = conf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);
        DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
        String builderClass = cubeDesc.getDictionaryBuilderClass(col);
        builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
        builder.init(dictionaryInfo, 0, hdfsDir);
    }
}
 
Example #13
Source File: ScalarFunctionDataMapper.java    From data-polygamy with BSD 3-Clause "New" or "Revised" License
@Override
public void setup(Context context)
        throws IOException, InterruptedException {
    
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String[] fileSplitTokens = fileSplit.getPath().getParent().toString().split("/");
    dataset = fileSplitTokens[fileSplitTokens.length-1];
    
    out = new MultipleOutputs<Text,Text>(context);
}
 
Example #14
Source File: AggregationReducer.java    From data-polygamy with BSD 3-Clause "New" or "Revised" License
@Override
public void setup(Context context)
        throws IOException, InterruptedException {
    String[] datasetNames = context.getConfiguration().get("dataset-name","").split(",");
    String[] datasetIds = context.getConfiguration().get("dataset-id","").split(",");
    for (int i = 0; i < datasetNames.length; i++)
        idToDataset.put(Integer.parseInt(datasetIds[i]), datasetNames[i]);
    out = new MultipleOutputs<SpatioTemporalWritable,FloatArrayWritable>(context);
    //out = new MultipleOutputs<Text,Text>(context);
}
 
Example #15
Source File: Task.java    From WIFIProbe with Apache License 2.0
private boolean analyze(final String inputFilePath,
                           final String outputFilePath,
                           final Long startTime) throws Exception {
    Configuration conf = new Configuration();
    conf.setLong(Holistic.START_TIME, startTime);
    conf.setLong(Holistic.EXECUTE_TIME, executeHourTime);

    Job jobAnalyze = Job.getInstance(conf, "analyze");

    jobAnalyze.setJarByClass(Holistic.class);

    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.NEW_OLD_CUSTOMER,
            TextOutputFormat.class, KeyWrapper.class, Text.class);
    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.CUSTOMER_FLOW_KEY,
            TextOutputFormat.class, KeyWrapper.class, Text.class);
    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.CYCLE,
            TextOutputFormat.class, KeyWrapper.class, Text.class);
    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.IN_STORE_HOUR,
            TextOutputFormat.class, KeyWrapper.class, Text.class);

    jobAnalyze.setMapperClass(AnalysisMapper.class);
    jobAnalyze.setReducerClass(AnalysisReducer.class);
    jobAnalyze.setCombinerClass(AnalysisCombiner.class);

    jobAnalyze.setOutputKeyClass(LongWritable.class);
    jobAnalyze.setOutputValueClass(Text.class);

    jobAnalyze.setMapOutputKeyClass(KeyWrapper.class);
    jobAnalyze.setMapOutputValueClass(ValueWrapper.class);

    FileInputFormat.addInputPath(jobAnalyze, new Path(inputFilePath));
    FileOutputFormat.setOutputPath(jobAnalyze, new Path(outputFilePath));

    return jobAnalyze.waitForCompletion(true);
}
 
Example #16
Source File: ColumnMultReducer.java    From incubator-retired-pirk with Apache License 2.0
@Override
public void setup(Context ctx) throws IOException, InterruptedException
{
  super.setup(ctx);

  outputValue = new Text();
  mos = new MultipleOutputs<>(ctx);

  FileSystem fs = FileSystem.newInstance(ctx.getConfiguration());
  String queryDir = ctx.getConfiguration().get("pirMR.queryInputDir");
  query = new HadoopFileSystemStore(fs).recall(queryDir, Query.class);
}
 
Example #17
Source File: MultipleOutputs.java    From hadoop with Apache License 2.0
/**
 * Creates and initializes multiple outputs support;
 * it should be instantiated in the Mapper/Reducer setup method.
 *
 * @param context the TaskInputOutputContext object
 */
public MultipleOutputs(
    TaskInputOutputContext<?, ?, KEYOUT, VALUEOUT> context) {
  this.context = context;
  namedOutputs = Collections.unmodifiableSet(
    new HashSet<String>(MultipleOutputs.getNamedOutputsList(context)));
  recordWriters = new HashMap<String, RecordWriter<?, ?>>();
  countersEnabled = getCountersEnabled(context);
}
 
Example #18
Source File: UHCDictionaryJob.java    From kylin with Apache License 2.0
private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(UHCDictionaryReducer.class);
    job.setPartitionerClass(UHCDictionaryPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class, NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
 
Example #19
Source File: UHCDictionaryJob.java    From kylin-on-parquet-v2 with Apache License 2.0
private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(UHCDictionaryReducer.class);
    job.setPartitionerClass(UHCDictionaryPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class, NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
 
Example #20
Source File: AbstractReasoningTool.java    From rya with Apache License 2.0
/**
 * Set up the MapReduce job to output a schema (TBox).
 */
protected void configureSchemaOutput() {
    Path outPath = MRReasoningUtils.getSchemaPath(job.getConfiguration());
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SchemaWritable.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, "schemaobj",
        SequenceFileOutputFormat.class, NullWritable.class, SchemaWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}
 
Example #21
Source File: AbstractReasoningTool.java    From rya with Apache License 2.0
/**
 * Set up a MapReduce job to output newly derived triples.
 * @param   intermediate    True if this is intermediate data. Outputs
 *                          to [base]-[iteration]-[temp].
 */
protected void configureDerivationOutput(boolean intermediate) {
    Path outPath;
    Configuration conf = job.getConfiguration();
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    if (intermediate) {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration
            + MRReasoningUtils.TEMP_SUFFIX);
    }
    else {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration);
    }
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT,
        SequenceFileOutputFormat.class, Derivation.class, NullWritable.class);
    MultipleOutputs.setCountersEnabled(job, true);
    // Set up an output for diagnostic info, if needed
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
}
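On the map/reduce side, the rya reasoners pick one of these named outputs per record. A hedged illustration of what such a write could look like, using the key/value classes registered above (the variables fact and mout are placeholders here, although Examples #22 and #23 show the mout field being created):

    // emit a newly derived fact to the terminal (final) output
    mout.write(MRReasoningUtils.TERMINAL_OUT, fact, NullWritable.get());
    // emit a human-readable trace line to the debug output
    mout.write(MRReasoningUtils.DEBUG_OUT, new Text(fact.toString()), new Text("derived in this iteration"));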
 
Example #22
Source File: ForwardChain.java    From rya with Apache License 2.0
@Override
protected void setup(Context context) {
    debugOut = new MultipleOutputs<>(context);
    Configuration conf = context.getConfiguration();
    if (schema == null) {
        schema = MRReasoningUtils.loadSchema(context.getConfiguration());
    }
    debug = MRReasoningUtils.debug(conf);
}
 
Example #23
Source File: ForwardChain.java    From rya with Apache License 2.0
@Override
public void setup(Context context) {
    mout = new MultipleOutputs<>(context);
    Configuration conf = context.getConfiguration();
    if (schema == null) {
        schema = MRReasoningUtils.loadSchema(conf);
    }
    debug = MRReasoningUtils.debug(conf);
}
 
Example #24
Source File: DuplicateElimination.java    From rya with Apache License 2.0
@Override
public void setup(Context context) {
    Configuration conf = context.getConfiguration();
    debug = MRReasoningUtils.debug(conf);
    if (debug) {
        debugOut = new MultipleOutputs<>(context);
    }
}
 
Example #25
Source File: DuplicateElimination.java    From rya with Apache License 2.0
@Override
public void setup(Context context) {
    Configuration conf = context.getConfiguration();
    mout = new MultipleOutputs<>(context);
    current = MRReasoningUtils.getCurrentIteration(conf);
    debug = MRReasoningUtils.debug(conf);
}
 
Example #26
Source File: DataSourceCompReducer.java    From jumbune with GNU Lesser General Public License v3.0
@SuppressWarnings({ "rawtypes", "unchecked" })
protected void setup(Reducer.Context context) {
	
	Configuration conf = context.getConfiguration();
	Type type = new TypeToken<Map<String, String>>() {
	}.getType();

	filesMap = gson.fromJson(conf.get("filesMap"), type);
	validationInfo = gson.fromJson(conf.get("validationInfoJson"), DataSourceCompValidationInfo.class);
	multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
}
 
Example #27
Source File: DataSourceCompMapper.java    From jumbune with GNU Lesser General Public License v3.0
@Override
@SuppressWarnings({ "unchecked", "rawtypes" })
protected void setup(Mapper.Context context) {
	Gson gson = new Gson();
	Configuration conf = context.getConfiguration();
	Type type = new TypeToken<Map<String, String>>() {
	}.getType();

	mapperInfo = gson.fromJson(conf.get("mapperInfoJson"), DataSourceCompMapperInfo.class);
	filesMap = gson.fromJson(conf.get("filesMap"), type);
	multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);

}
 
Example #28
Source File: MultipleOutputsJob.java    From hiped2 with Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  Cli cli = Cli.builder().setArgs(args).addOptions(IOOptions.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path input = new Path(cli.getArgValueAsString(IOOptions.INPUT));
  Path output = new Path(cli.getArgValueAsString(IOOptions.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(MultipleOutputsJob.class);
  job.setMapperClass(Map.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.setNumReduceTasks(0);

  MultipleOutputs.addNamedOutput(job, "partition",
      TextOutputFormat.class, Text.class, Text.class);

  return job.waitForCompletion(true) ? 0 : 1;
}
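The Map class referenced by this driver is not shown. Since the job registers a named output called "partition" with Text keys and values and runs with zero reducers, a map-only sketch along the following lines would fit; this is a reconstruction for illustration, not the actual hiped2 mapper.

public static class Map extends Mapper<LongWritable, Text, Text, Text> {

    private MultipleOutputs<Text, Text> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Write every record to the "partition" named output registered in the driver
        mos.write("partition", new Text(key.toString()), value);
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }
}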
 
Example #29
Source File: BasicJobChaining.java    From hadoop-map-reduce-patterns with Apache License 2.0
protected void setup(Context context) throws IOException, InterruptedException {
	average = getAveragePostsPerUser(context.getConfiguration());
	mos = new MultipleOutputs<Text, Text>(context);

	try {
		Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());

		if (files == null || files.length == 0) {
			throw new RuntimeException("User information is not set in DistributedCache");
		}

		// Read all files in the DistributedCache
		for (Path p : files) {
			BufferedReader rdr = new BufferedReader(new InputStreamReader(
					new GZIPInputStream(new FileInputStream(new File(p.toString())))));

			String line;
			// For each record in the user file
			while ((line = rdr.readLine()) != null) {

				// Get the user ID and reputation
				Map<String, String> parsed = MRDPUtils.transformXmlToMap(line);
				String userId = parsed.get("Id");
				String reputation = parsed.get("Reputation");

				if (userId != null && reputation != null) {
					// Map the user ID to the reputation
					userIdToReputation.put(userId, reputation);
				}
			}
		}

	} catch (IOException e) {
		throw new RuntimeException(e);
	}
}
 
Example #30
Source File: BinningTags.java    From hadoop-map-reduce-patterns with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
	Configuration conf = new Configuration();
	GenericOptionsParser parser = new GenericOptionsParser(conf, args);
	String[] otherArgs = parser.getRemainingArgs();
	if (otherArgs.length != 2) {
		System.err.println("Usage: BinningTags <in> <out>");
		ToolRunner.printGenericCommandUsage(System.err);
		System.exit(2);
	}
	Job job = new Job(conf, "Binning Tags");
	job.setJarByClass(BinningTags.class);
	// Configure the MultipleOutputs by adding an output called "bins"
	// With the proper output format and mapper key/value pairs
	MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class,
			Text.class, NullWritable.class);
	// Enable the counters for the job
	// If there are a significant number of different named outputs, this
	// should be disabled
	MultipleOutputs.setCountersEnabled(job, true);
	// Map-only job
	job.setNumReduceTasks(0);
	job.setMapperClass(BinningMapper.class);
	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(NullWritable.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(NullWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	boolean success = job.waitForCompletion(true);
	return success ? 0 : 1;
}
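The BinningMapper configured above is not shown either. The usual binning pattern is to register a single named output ("bins" here) and then vary the base output path per record, so each bin lands in its own set of files. A hedged fragment of what the mapper's map() body could contain (the bin names are placeholders):

    // route the record into a bin-specific file under the job output directory;
    // the base output path determines the file name (e.g. hadoop-m-00000, other-m-00000)
    String bin = value.toString().contains("hadoop") ? "hadoop" : "other";
    mos.write("bins", value, NullWritable.get(), bin);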