Java Code Examples for org.apache.hadoop.mapreduce.Job#getInstance()
The following examples show how to use org.apache.hadoop.mapreduce.Job#getInstance().
Each example links to the original project and source file above the code.
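Before the project examples, here is a minimal, self-contained driver sketch of the basic Job.getInstance(Configuration, String) pattern. It is an illustration only: the class name PassThroughDriver and the command-line input/output paths are hypothetical placeholders, and the base Mapper/Reducer classes are used as identity functions just to keep the sketch compilable.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PassThroughDriver { // hypothetical class name, not taken from the examples below
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Job.getInstance(Configuration, String) copies the Configuration, so later
        // changes to 'conf' do not reach the job; use job.getConfiguration() instead.
        Job job = Job.getInstance(conf, "identity-passthrough");
        job.setJarByClass(PassThroughDriver.class);
        job.setMapperClass(Mapper.class);    // identity mapper
        job.setReducerClass(Reducer.class);  // identity reducer
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input dir (placeholder)
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output dir (placeholder)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The no-argument overload Job.getInstance() creates the job with a fresh Configuration; that form appears in Examples 5, 6, and 8 below.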
Example 1
Source File: UpdateCFJob.java From recsys-offline with Apache License 2.0 | 6 votes |
public void run() throws Exception {
    long startTime = System.currentTimeMillis();
    Configuration conf = new Configuration();
    conf.set(TableOutputFormat.OUTPUT_TABLE, Constants.hbase_user_item_pref_table);
    Job job = Job.getInstance(conf, "hbasewriter" + System.currentTimeMillis());
    job.setJarByClass(UpdateCFJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(HBaseWriteReducer.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TableOutputFormat.class);
    FileInputFormat.addInputPath(job, new Path(input));
    boolean isFinish = job.waitForCompletion(true);
    // Capture the end time after the job completes so the reported cost includes the run.
    long endTime = System.currentTimeMillis();
    if (isFinish) {
        logger.info("UpdateCFJob job [" + job.getJobName() + "] run finish. It costs "
                + (endTime - startTime) / 1000 + "s.");
    } else {
        logger.error("UpdateCFJob job [" + job.getJobName() + "] run failed.");
    }
}
Example 2
Source File: Export.java From hbase with Apache License 2.0 | 6 votes |
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    Triple<TableName, Scan, Path> arguments = ExportUtils.getArgumentsFromCommandLine(conf, args);
    String tableName = arguments.getFirst().getNameAsString();
    Path outputDir = arguments.getThird();
    Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJobName(NAME + "_" + tableName);
    job.setJarByClass(Export.class);
    // Set optional scan parameters
    Scan s = arguments.getSecond();
    IdentityTableMapper.initJob(tableName, s, IdentityTableMapper.class, job);
    // No reducers. Just write straight to output files.
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Result.class);
    FileOutputFormat.setOutputPath(job, outputDir); // job conf doesn't contain the conf so doesn't have a default fs.
    return job;
}
Example 3
Source File: ElemValueCooccurrencesTest.java From marklogic-contentpump with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 1) {
        System.err.println("Usage: ElemValueCooccurrencesTest configFile outputDir");
        System.exit(2);
    }

    Job job = Job.getInstance(conf);
    job.setJarByClass(ElemValueCooccurrencesTest.class);
    job.setInputFormatClass(ValueInputFormat.class);
    job.setMapperClass(ElemCooccurrencesMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
    conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class);
    conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS,
            ElemValueCooccurrencesFunction.class, ElemValueCooccurrences.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 4
Source File: TestJobCounters.java From big-c with Apache License 2.0 | 6 votes |
public static Job createJob() throws IOException {
    final Configuration conf = new Configuration();
    final Job baseJob = Job.getInstance(conf);
    baseJob.setOutputKeyClass(Text.class);
    baseJob.setOutputValueClass(IntWritable.class);
    baseJob.setMapperClass(NewMapTokenizer.class);
    baseJob.setCombinerClass(NewSummer.class);
    baseJob.setReducerClass(NewSummer.class);
    baseJob.setNumReduceTasks(1);
    baseJob.getConfiguration().setInt(JobContext.IO_SORT_MB, 1);
    baseJob.getConfiguration().set(JobContext.MAP_SORT_SPILL_PERCENT, "0.50");
    baseJob.getConfiguration().setInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setMinInputSplitSize(
            baseJob, Long.MAX_VALUE);
    return baseJob;
}
Example 5
Source File: CSVToParquet.java From parquet-flinktacular with Apache License 2.0 | 5 votes |
private static void createStoreSales(ExecutionEnvironment env) throws IOException {
    DataSet<Tuple2<Void, StoreSalesTable>> storeSales =
            getStoreSalesDataSet(env).map(new StoreSalesToParquet());

    Job job = Job.getInstance();
    HadoopOutputFormat hadoopOutputFormat =
            new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);

    ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/storesales"));
    ParquetThriftOutputFormat.setThriftClass(job, StoreSalesTable.class);
    ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(job, true);
    ParquetThriftOutputFormat.setEnableDictionary(job, false);

    storeSales.output(hadoopOutputFormat);
}
Example 6
Source File: TestMRCJCFileOutputCommitter.java From big-c with Apache License 2.0 | 5 votes |
@SuppressWarnings("unchecked") public void testCommitter() throws Exception { Job job = Job.getInstance(); FileOutputFormat.setOutputPath(job, outDir); Configuration conf = job.getConfiguration(); conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt); JobContext jContext = new JobContextImpl(conf, taskID.getJobID()); TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID); FileOutputCommitter committer = new FileOutputCommitter(outDir, tContext); // setup committer.setupJob(jContext); committer.setupTask(tContext); // write output TextOutputFormat theOutputFormat = new TextOutputFormat(); RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(tContext); writeOutput(theRecordWriter, tContext); // do commit committer.commitTask(tContext); committer.commitJob(jContext); // validate output File expectedFile = new File(new Path(outDir, partFile).toString()); StringBuffer expectedOutput = new StringBuffer(); expectedOutput.append(key1).append('\t').append(val1).append("\n"); expectedOutput.append(val1).append("\n"); expectedOutput.append(val2).append("\n"); expectedOutput.append(key2).append("\n"); expectedOutput.append(key1).append("\n"); expectedOutput.append(key2).append('\t').append(val2).append("\n"); String output = UtilsForTests.slurp(expectedFile); assertEquals(output, expectedOutput.toString()); FileUtil.fullyDelete(new File(outDir.toString())); }
Example 7
Source File: TestCombineTextInputFormat.java From big-c with Apache License 2.0 | 5 votes |
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, conf);
    localFs.delete(workDir, true);
    writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, workDir);
    CombineTextInputFormat format = new CombineTextInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    assertEquals("compressed splits == 1", 1, splits.size());
    List<Text> results = readSplit(format, splits.get(0), job);
    assertEquals("splits[0] length", 8, results.size());

    final String[] firstList =
            {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
    final String[] secondList = {"this is a test", "of gzip"};
    String first = results.get(0).toString();
    if (first.equals(firstList[0])) {
        testResults(results, firstList, secondList);
    } else if (first.equals(secondList[0])) {
        testResults(results, secondList, firstList);
    } else {
        fail("unexpected first token!");
    }
}
Example 8
Source File: ConfigurationProxyV2.java From pentaho-hadoop-shims with Apache License 2.0 | 5 votes |
public ConfigurationProxyV2( String namedCluster ) throws IOException {
    job = Job.getInstance();

    // Reset static HashSets for logging
    ShimConfigsLoader.CLUSTER_NAME_FOR_LOGGING.clear();
    ShimConfigsLoader.SITE_FILE_NAME.clear();

    addConfigsForJobConf( namedCluster );
}
Example 9
Source File: MRStressTest.java From incubator-gobblin with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    CommandLine cli = StressTestUtils.parseCommandLine(OPTIONS, args);

    Configuration configuration = new Configuration();
    if (cli.hasOption(THROTTLING_SERVER_URI.getOpt())) {
        configuration.setBoolean(USE_THROTTLING_SERVER, true);
        String resourceLimited = cli.getOptionValue(RESOURCE_ID_OPT.getOpt(), "MRStressTest");
        configuration.set(RESOURCE_ID, resourceLimited);
        configuration.set(
                BrokerConfigurationKeyGenerator.generateKey(new SharedRestClientFactory(),
                        new SharedRestClientKey(RestliLimiterFactory.RESTLI_SERVICE_NAME), null,
                        SharedRestClientFactory.SERVER_URI_KEY),
                cli.getOptionValue(THROTTLING_SERVER_URI.getOpt()));
    }
    if (cli.hasOption(LOCAL_QPS_OPT.getOpt())) {
        configuration.set(LOCALLY_ENFORCED_QPS, cli.getOptionValue(LOCAL_QPS_OPT.getOpt()));
    }

    Job job = Job.getInstance(configuration, "ThrottlingStressTest");
    job.getConfiguration().setBoolean("mapreduce.job.user.classpath.first", true);
    job.getConfiguration().setBoolean("mapreduce.map.speculative", false);
    job.getConfiguration().set(NUM_MAPPERS, cli.getOptionValue(NUM_MAPPERS_OPT.getOpt(), DEFAULT_MAPPERS));
    StressTestUtils.populateConfigFromCli(job.getConfiguration(), cli);

    job.setJarByClass(MRStressTest.class);
    job.setMapperClass(StresserMapper.class);
    job.setReducerClass(AggregatorReducer.class);
    job.setInputFormatClass(MyInputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileOutputFormat.setOutputPath(job, new Path("/tmp/MRStressTest" + System.currentTimeMillis()));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 10
Source File: CalculateSimilarityStep4.java From RecommendationEngine with MIT License | 5 votes |
public static void run() throws IOException, ClassNotFoundException, InterruptedException {
    String inputPath = ItemBasedCFDriver.path.get("step4InputPath");
    String outputPath = ItemBasedCFDriver.path.get("step4OutputPath");

    Configuration conf = new Configuration();
    conf.set("mapred.textoutputformat.separator", ":");

    Job job = Job.getInstance(conf);

    HDFS hdfs = new HDFS(conf);
    hdfs.rmr(outputPath);

    job.setMapperClass(Step4_Mapper.class);
    job.setReducerClass(Step4_Reducer.class);
    job.setJarByClass(CalculateSimilarityStep4.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);
}
Example 11
Source File: HadoopCmdOutputTest.java From kylin with Apache License 2.0 | 5 votes |
@Test(expected = IllegalStateException.class)
public void testGetMrJobIdThrowsIllegalStateException() throws IOException {
    Job job = Job.getInstance(new Configuration(false));
    HadoopCmdOutput hadoopCmdOutput = new HadoopCmdOutput(job, new StringBuilder());
    assertNull(hadoopCmdOutput.getMrJobId());
}
Example 12
Source File: MergeDictionaryJob.java From kylin with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    try {
        Options options = new Options();
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_META_URL);
        options.addOption(OPTION_MERGE_SEGMENT_IDS);
        options.addOption(OPTION_OUTPUT_PATH_DICT);
        options.addOption(OPTION_OUTPUT_PATH_STAT);
        parseOptions(options, args);

        final String segmentId = getOptionValue(OPTION_SEGMENT_ID);
        final String segmentIds = getOptionValue(OPTION_MERGE_SEGMENT_IDS);
        final String cubeName = getOptionValue(OPTION_CUBE_NAME);
        final String metaUrl = getOptionValue(OPTION_META_URL);
        final String dictOutputPath = getOptionValue(OPTION_OUTPUT_PATH_DICT);
        final String statOutputPath = getOptionValue(OPTION_OUTPUT_PATH_STAT);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeDesc cubeDesc = cube.getDescriptor();
        CubeSegment segment = cube.getSegmentById(segmentId);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        job.getConfiguration().set(BatchConstants.ARG_CUBE_NAME, cubeName);
        job.getConfiguration().set(OPTION_META_URL.getOpt(), metaUrl);
        job.getConfiguration().set(OPTION_SEGMENT_ID.getOpt(), segmentId);
        job.getConfiguration().set(OPTION_MERGE_SEGMENT_IDS.getOpt(), segmentIds);
        job.getConfiguration().set(OPTION_OUTPUT_PATH_STAT.getOpt(), statOutputPath);
        job.getConfiguration().set("num.map.tasks",
                String.valueOf(cubeDesc.getAllColumnsNeedDictionaryBuilt().size() + 1));
        job.setNumReduceTasks(1);

        setJobClasspath(job, cube.getConfig());

        // dump metadata to HDFS
        attachSegmentsMetadataWithDict(mergingSeg, metaUrl);

        // clean output dir
        HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

        job.setMapperClass(MergeDictionaryMapper.class);
        job.setReducerClass(MergeDictionaryReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(IndexArrInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.NONE);
        SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

        logger.info("Starting: " + job.getJobName());

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 13
Source File: MergeCuboidJob.java From Kylin with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase();
        String segmentName = getOptionValue(OPTION_SEGMENT_NAME).toUpperCase();
        KylinConfig config = KylinConfig.getInstanceFromEnv();
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        System.out.println("Starting: " + jobName);
        job = Job.getInstance(getConf(), jobName);

        setJobClasspath(job);

        // set inputs
        addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);

        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);

        // Mapper
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(MergeCuboidMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reducer - only one
        job.setReducerClass(CuboidReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_NAME, segmentName);

        // add metadata to distributed cache
        attachKylinPropsAndMetadata(cube, job.getConfiguration());

        setReduceTaskNum(job, config, cubeName, 0);

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        logger.error("error in MergeCuboidJob", e);
        printUsage(options);
        throw e;
    }
}
Example 14
Source File: MergeCuboidJob.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        String input = getOptionValue(OPTION_INPUT_PATH);
        String output = getOptionValue(OPTION_OUTPUT_PATH);
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment cubeSeg = cube.getSegmentById(segmentID);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        logger.info("Starting: " + jobName);
        job = Job.getInstance(getConf(), jobName);

        setJobClasspath(job, cube.getConfig());

        // add metadata to distributed cache
        Segments<CubeSegment> allSegs = cube.getMergingSegments(cubeSeg);
        allSegs.add(cubeSeg);
        attachSegmentsMetadataWithDict(allSegs, job.getConfiguration());

        // Mapper
        job.setMapperClass(MergeCuboidMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reducer
        job.setReducerClass(CuboidReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // set inputs
        IMROutput2.IMRMergeOutputFormat outputFormat =
                MRUtil.getBatchMergeOutputSide2(cubeSeg).getOutputFormat();
        outputFormat.configureJobInput(job, input);
        addInputDirs(input, job);

        // set output
        outputFormat.configureJobOutput(job, output, cubeSeg);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 15
Source File: HalyardBulkLoad.java From Halyard with Apache License 2.0 | 4 votes |
@Override
protected int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String workdir = cmd.getOptionValue('w');
    String target = cmd.getOptionValue('t');
    getConf().setBoolean(SKIP_INVALID_PROPERTY, cmd.hasOption('i'));
    getConf().setBoolean(VERIFY_DATATYPE_VALUES_PROPERTY, cmd.hasOption('d'));
    getConf().setBoolean(TRUNCATE_PROPERTY, cmd.hasOption('r'));
    getConf().setInt(SPLIT_BITS_PROPERTY, Integer.parseInt(cmd.getOptionValue('b', "3")));
    if (cmd.hasOption('g')) getConf().set(DEFAULT_CONTEXT_PROPERTY, cmd.getOptionValue('g'));
    getConf().setBoolean(OVERRIDE_CONTEXT_PROPERTY, cmd.hasOption('o'));
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY,
            Long.parseLong(cmd.getOptionValue('e', String.valueOf(System.currentTimeMillis()))));
    if (cmd.hasOption('m')) getConf().setLong("mapreduce.input.fileinputformat.split.maxsize",
            Long.parseLong(cmd.getOptionValue('m')));
    TableMapReduceUtil.addDependencyJars(getConf(),
            NTriplesUtil.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + workdir + " -> " + target);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), target, true,
            getConf().getInt(SPLIT_BITS_PROPERTY, 3))) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, source);
        FileOutputFormat.setOutputPath(job, new Path(workdir));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            if (getConf().getBoolean(TRUNCATE_PROPERTY, false)) {
                HalyardTableUtils.truncateTable(hTable).close();
            }
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(workdir), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}
Example 16
Source File: MRInputHelpers.java From tez with Apache License 2.0 | 4 votes |
/**
 * Helper api to generate splits
 * @param conf Configuration with all necessary information set to generate
 * splits. The following are required at a minimum:
 *
 *   - mapred.mapper.new-api: determine whether mapred.InputFormat or
 *     mapreduce.InputFormat is to be used
 *   - mapred.input.format.class or mapreduce.job.inputformat.class:
 *     determines the InputFormat class to be used
 *
 * In addition to this, all the configs needed by the InputFormat class also
 * have to be set. For example, FileInputFormat needs the input directory
 * paths to be set in the config.
 *
 * @param inputSplitsDir Directory in which the splits file and meta info file
 * will be generated. job.split and job.splitmetainfo files in this directory
 * will be overwritten. Should be a fully-qualified path.
 *
 * @return InputSplitInfo containing the split files' information and the
 * number of splits generated to be used to determining parallelism of
 * the map stage.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private static InputSplitInfoDisk generateInputSplits(Configuration conf,
        Path inputSplitsDir) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(conf);
    JobConf jobConf = new JobConf(conf);
    conf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);
    if (jobConf.getUseNewMapper()) {
        LOG.info("Generating new input splits"
                + ", splitsDir=" + inputSplitsDir.toString());
        return writeNewSplits(job, inputSplitsDir);
    } else {
        LOG.info("Generating old input splits"
                + ", splitsDir=" + inputSplitsDir.toString());
        return writeOldSplits(jobConf, inputSplitsDir);
    }
}
Example 17
Source File: SparkUHCDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
Example 18
Source File: OSMConversionRunner.java From geowave with Apache License 2.0 | 4 votes |
@Override
public int run(final String[] args) throws Exception {

    final Configuration conf = getConf();
    final AccumuloRequiredOptions accumuloOptions =
            (AccumuloRequiredOptions) inputStoreOptions.getFactoryOptions();

    // job settings
    final Job job = Job.getInstance(conf, ingestOptions.getJobName() + "NodeConversion");
    job.setJarByClass(OSMConversionRunner.class);

    job.getConfiguration().set("osm_mapping", ingestOptions.getMappingContents());
    job.getConfiguration().set("arguments", ingestOptions.serializeToString());

    if (ingestOptions.getVisibilityOptions().getVisibility() != null) {
        job.getConfiguration().set(
                AbstractMapReduceIngest.GLOBAL_VISIBILITY_KEY,
                ingestOptions.getVisibilityOptions().getVisibility());
    }

    // input format
    AbstractInputFormat.setConnectorInfo(
            job,
            accumuloOptions.getUser(),
            new PasswordToken(accumuloOptions.getPassword()));
    InputFormatBase.setInputTableName(job, ingestOptions.getQualifiedTableName());
    AbstractInputFormat.setZooKeeperInstance(
            job,
            new ClientConfiguration().withInstance(accumuloOptions.getInstance()).withZkHosts(
                    accumuloOptions.getZookeeper()));
    AbstractInputFormat.setScanAuthorizations(
            job,
            new Authorizations(ingestOptions.getVisibilityOptions().getVisibility()));

    final IteratorSetting is = new IteratorSetting(50, "WholeRow", WholeRowIterator.class);
    InputFormatBase.addIterator(job, is);
    job.setInputFormatClass(AccumuloInputFormat.class);
    final Range r = new Range();
    // final ArrayList<Pair<Text, Text>> columns = new ArrayList<>();
    InputFormatBase.setRanges(job, Arrays.asList(r));

    // output format
    GeoWaveOutputFormat.setStoreOptions(job.getConfiguration(), inputStoreOptions);
    final AccumuloOptions options = new AccumuloOptions();
    final AdapterStore as =
            new AdapterStoreImpl(
                    new AccumuloOperations(
                            accumuloOptions.getZookeeper(),
                            accumuloOptions.getInstance(),
                            accumuloOptions.getUser(),
                            accumuloOptions.getPassword(),
                            accumuloOptions.getGeoWaveNamespace(),
                            options),
                    options);
    for (final FeatureDataAdapter fda : FeatureDefinitionSet.featureAdapters.values()) {
        as.addAdapter(fda);
        GeoWaveOutputFormat.addDataAdapter(job.getConfiguration(), fda);
    }
    final Index primaryIndex =
            new SpatialDimensionalityTypeProvider().createIndex(new SpatialOptions());
    GeoWaveOutputFormat.addIndex(job.getConfiguration(), primaryIndex);
    job.getConfiguration().set(AbstractMapReduceIngest.INDEX_NAMES_KEY, primaryIndex.getName());

    job.setOutputFormatClass(GeoWaveOutputFormat.class);
    job.setMapOutputKeyClass(GeoWaveOutputKey.class);
    job.setMapOutputValueClass(SimpleFeature.class);

    // mapper
    job.setMapperClass(OSMConversionMapper.class);
    // reducer
    job.setNumReduceTasks(0);

    return job.waitForCompletion(true) ? 0 : -1;
}
Example 19
Source File: ConvertToHFiles.java From examples with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    try {
        Configuration conf = HBaseConfiguration.create();
        Connection connection = ConnectionFactory.createConnection(conf);

        String inputPath = args[0];
        String outputPath = args[1];
        final TableName tableName = TableName.valueOf(args[2]);

        // tag::SETUP[]
        Table table = connection.getTable(tableName);

        Job job = Job.getInstance(conf, "ConvertToHFiles: Convert CSV to HFiles");
        HFileOutputFormat2.configureIncrementalLoad(job, table,
                connection.getRegionLocator(tableName)); // <1>

        job.setInputFormatClass(TextInputFormat.class); // <2>

        job.setJarByClass(ConvertToHFiles.class); // <3>
        job.setJar("/home/cloudera/ahae/target/ahae.jar"); // <3>

        job.setMapperClass(ConvertToHFilesMapper.class); // <4>
        job.setMapOutputKeyClass(ImmutableBytesWritable.class); // <5>
        job.setMapOutputValueClass(KeyValue.class); // <6>

        FileInputFormat.setInputPaths(job, inputPath);
        HFileOutputFormat2.setOutputPath(job, new Path(outputPath));
        // end::SETUP[]

        if (!job.waitForCompletion(true)) {
            LOG.error("Failure");
        } else {
            LOG.info("Success");
            return 0;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return 1;
}
Example 20
Source File: ExtractDictionaryFromGlobalJob.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBING_JOB_ID);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
        job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);

        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        // ----------------------------------------------------------------------------
        // add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegmentById(segmentID);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        logger.info("Starting: " + job.getJobName());

        job.getConfiguration().set("mapreduce.map.speculative", "false");
        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(ExtractDictionaryFromGlobalMapper.class);

        // Reducer
        job.setNumReduceTasks(0);

        // Input
        IMRInput.IMRTableInputFormat flatTableInputFormat = MRUtil.getBatchCubingInputSide(segment)
                .getFlatTableInputFormat();
        flatTableInputFormat.configureJob(job);

        // Output
        //// prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        deletePath(job.getConfiguration(), output);

        attachSegmentMetadataWithDict(segment, job.getConfiguration());
        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}