Java Code Examples for org.apache.hadoop.mapreduce.Job#getInstance()

The following examples show how to use org.apache.hadoop.mapreduce.Job#getInstance(). Each example is taken from an open-source project; the source file, project, and license are noted above each listing.
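Before the project-specific examples, a minimal, self-contained driver may help as a baseline. This is a sketch only: WordCountDriver, WordCountMapper, and WordCountReducer are hypothetical placeholder classes standing in for your own driver, map, and reduce implementations.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Job.getInstance(conf, name) copies conf; later changes to conf do not affect the job.
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(WordCountMapper.class);    // hypothetical mapper
        job.setReducerClass(WordCountReducer.class);  // hypothetical reducer
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}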
Example 1
Source File: UpdateCFJob.java    From recsys-offline with Apache License 2.0
public void run() throws Exception{
	long startTime = System.currentTimeMillis();
	Configuration conf = new Configuration();
	conf.set(TableOutputFormat.OUTPUT_TABLE, Constants.hbase_user_item_pref_table);
	Job job = Job.getInstance(conf, "hbasewriter"+System.currentTimeMillis());
	job.setJarByClass(UpdateCFJob.class);
	job.setMapperClass(TokenizerMapper.class);
	job.setReducerClass(HBaseWriteReducer.class);
	job.setMapOutputKeyClass(IntWritable.class);  
	job.setMapOutputValueClass(Text.class);
	job.setOutputFormatClass(TableOutputFormat.class);
	FileInputFormat.addInputPath(job, new Path(input));
	boolean isFinish = job.waitForCompletion(true);
	long endTime = System.currentTimeMillis();
	if (isFinish) {
		logger.info("UpdateCFJob job [" + job.getJobName() + "] finished. It took " + (endTime - startTime) / 1000 + "s.");
	} else {
		logger.error("UpdateCFJob job [" + job.getJobName() + "] failed.");
	}
}
 
Example 2
Source File: Export.java    From hbase with Apache License 2.0
/**
 * Sets up the actual job.
 *
 * @param conf  The current configuration.
 * @param args  The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args)
throws IOException {
  Triple<TableName, Scan, Path> arguments = ExportUtils.getArgumentsFromCommandLine(conf, args);
  String tableName = arguments.getFirst().getNameAsString();
  Path outputDir = arguments.getThird();
  Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
  job.setJobName(NAME + "_" + tableName);
  job.setJarByClass(Export.class);
  // Set optional scan parameters
  Scan s = arguments.getSecond();
  IdentityTableMapper.initJob(tableName, s, IdentityTableMapper.class, job);
  // No reducers.  Just write straight to output files.
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(Result.class);
  FileOutputFormat.setOutputPath(job, outputDir); // job conf doesn't contain the conf so doesn't have a default fs.
  return job;
}
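For reference, a caller would typically submit the returned job and exit with its status, roughly as the tool's own main method does. A hedged sketch, assuming the parsed command-line arguments (otherArgs) are already available:

Configuration conf = HBaseConfiguration.create();
// Build the export job and block until it completes.
Job job = createSubmittableJob(conf, otherArgs);
System.exit(job.waitForCompletion(true) ? 0 : 1);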
 
Example 3
Source File: ElemValueCooccurrencesTest.java    From marklogic-contentpump with Apache License 2.0
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: ElemValueCooccurrencesTest configFile outputDir");
        System.exit(2);
    }

    Job job = Job.getInstance(conf);
    job.setJarByClass(ElemValueCooccurrencesTest.class);
    job.setInputFormatClass(ValueInputFormat.class);
    job.setMapperClass(ElemCooccurrencesMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
    conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, 
            Writable.class);
    conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS, 
        ElemValueCooccurrencesFunction.class, ElemValueCooccurrences.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 4
Source File: TestJobCounters.java    From big-c with Apache License 2.0
public static Job createJob() throws IOException {
  final Configuration conf = new Configuration();
  final Job baseJob = Job.getInstance(conf);
  baseJob.setOutputKeyClass(Text.class);
  baseJob.setOutputValueClass(IntWritable.class);
  baseJob.setMapperClass(NewMapTokenizer.class);
  baseJob.setCombinerClass(NewSummer.class);
  baseJob.setReducerClass(NewSummer.class);
  baseJob.setNumReduceTasks(1);
  baseJob.getConfiguration().setInt(JobContext.IO_SORT_MB, 1);
  baseJob.getConfiguration().set(JobContext.MAP_SORT_SPILL_PERCENT, "0.50");
  baseJob.getConfiguration().setInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
  org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setMinInputSplitSize(
      baseJob, Long.MAX_VALUE);
  return baseJob;
}
 
Example 5
Source File: CSVToParquet.java    From parquet-flinktacular with Apache License 2.0
private static void createStoreSales(ExecutionEnvironment env) throws IOException {
	DataSet<Tuple2<Void, StoreSalesTable>> storeSales = getStoreSalesDataSet(env).map(new StoreSalesToParquet());

	Job job = Job.getInstance();

	HadoopOutputFormat<Void, StoreSalesTable> hadoopOutputFormat = new HadoopOutputFormat<>(new ParquetThriftOutputFormat<StoreSalesTable>(), job);

	ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/storesales"));
	ParquetThriftOutputFormat.setThriftClass(job, StoreSalesTable.class);
	ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetThriftOutputFormat.setCompressOutput(job, true);
	ParquetThriftOutputFormat.setEnableDictionary(job, false);

	storeSales.output(hadoopOutputFormat);
}
 
Example 6
Source File: TestMRCJCFileOutputCommitter.java    From big-c with Apache License 2.0
@SuppressWarnings("unchecked")
public void testCommitter() throws Exception {
  Job job = Job.getInstance();
  FileOutputFormat.setOutputPath(job, outDir);
  Configuration conf = job.getConfiguration();
  conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
  JobContext jContext = new JobContextImpl(conf, taskID.getJobID());
  TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID);
  FileOutputCommitter committer = new FileOutputCommitter(outDir, tContext);

  // setup
  committer.setupJob(jContext);
  committer.setupTask(tContext);

  // write output
  TextOutputFormat theOutputFormat = new TextOutputFormat();
  RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(tContext);
  writeOutput(theRecordWriter, tContext);

  // do commit
  committer.commitTask(tContext);
  committer.commitJob(jContext);

  // validate output
  File expectedFile = new File(new Path(outDir, partFile).toString());
  StringBuffer expectedOutput = new StringBuffer();
  expectedOutput.append(key1).append('\t').append(val1).append("\n");
  expectedOutput.append(val1).append("\n");
  expectedOutput.append(val2).append("\n");
  expectedOutput.append(key2).append("\n");
  expectedOutput.append(key1).append("\n");
  expectedOutput.append(key2).append('\t').append(val2).append("\n");
  String output = UtilsForTests.slurp(expectedFile);
  assertEquals(output, expectedOutput.toString());
  FileUtil.fullyDelete(new File(outDir.toString()));
}
 
Example 7
Source File: TestCombineTextInputFormat.java    From big-c with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, workDir);
  CombineTextInputFormat format = new CombineTextInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 1", 1, splits.size());
  List<Text> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 8, results.size());

  final String[] firstList =
    {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
  final String[] secondList = {"this is a test", "of gzip"};
  String first = results.get(0).toString();
  if (first.equals(firstList[0])) {
    testResults(results, firstList, secondList);
  } else if (first.equals(secondList[0])) {
    testResults(results, secondList, firstList);
  } else {
    fail("unexpected first token!");
  }
}
 
Example 8
Source File: ConfigurationProxyV2.java    From pentaho-hadoop-shims with Apache License 2.0
public ConfigurationProxyV2( String namedCluster ) throws IOException {
  job = Job.getInstance();

  // Reset static HashSets for logging
  ShimConfigsLoader.CLUSTER_NAME_FOR_LOGGING.clear();
  ShimConfigsLoader.SITE_FILE_NAME.clear();

  addConfigsForJobConf( namedCluster );
}
 
Example 9
Source File: MRStressTest.java    From incubator-gobblin with Apache License 2.0
public static void main(String[] args) throws Exception {

    CommandLine cli = StressTestUtils.parseCommandLine(OPTIONS, args);

    Configuration configuration = new Configuration();
    if (cli.hasOption(THROTTLING_SERVER_URI.getOpt())) {
      configuration.setBoolean(USE_THROTTLING_SERVER, true);
      String resourceLimited = cli.getOptionValue(RESOURCE_ID_OPT.getOpt(), "MRStressTest");
      configuration.set(RESOURCE_ID, resourceLimited);
      configuration.set(
          BrokerConfigurationKeyGenerator.generateKey(new SharedRestClientFactory(),
              new SharedRestClientKey(RestliLimiterFactory.RESTLI_SERVICE_NAME),
              null, SharedRestClientFactory.SERVER_URI_KEY), cli.getOptionValue(THROTTLING_SERVER_URI.getOpt()));
    }

    if (cli.hasOption(LOCAL_QPS_OPT.getOpt())) {
      configuration.set(LOCALLY_ENFORCED_QPS, cli.getOptionValue(LOCAL_QPS_OPT.getOpt()));
    }

    Job job = Job.getInstance(configuration, "ThrottlingStressTest");
    job.getConfiguration().setBoolean("mapreduce.job.user.classpath.first", true);
    job.getConfiguration().setBoolean("mapreduce.map.speculative", false);

    job.getConfiguration().set(NUM_MAPPERS, cli.getOptionValue(NUM_MAPPERS_OPT.getOpt(), DEFAULT_MAPPERS));
    StressTestUtils.populateConfigFromCli(job.getConfiguration(), cli);

    job.setJarByClass(MRStressTest.class);
    job.setMapperClass(StresserMapper.class);
    job.setReducerClass(AggregatorReducer.class);
    job.setInputFormatClass(MyInputFormat.class);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileOutputFormat.setOutputPath(job, new Path("/tmp/MRStressTest" + System.currentTimeMillis()));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
 
Example 10
Source File: CalculateSimilarityStep4.java    From RecommendationEngine with MIT License
public static void run() throws IOException, ClassNotFoundException,
		InterruptedException {
	String inputPath = ItemBasedCFDriver.path.get("step4InputPath");
	String outputPath = ItemBasedCFDriver.path.get("step4OutputPath");

	Configuration conf = new Configuration();
	conf.set("mapred.textoutputformat.separator", ":");
	Job job = Job.getInstance(conf);

	HDFS hdfs = new HDFS(conf);
	hdfs.rmr(outputPath);

	job.setMapperClass(Step4_Mapper.class);
	job.setReducerClass(Step4_Reducer.class);

	job.setJarByClass(CalculateSimilarityStep4.class);

	job.setMapOutputKeyClass(IntWritable.class);
	job.setMapOutputValueClass(IntWritable.class);
	job.setOutputKeyClass(IntWritable.class);
	job.setOutputValueClass(IntWritable.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.setInputPaths(job, new Path(inputPath));
	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	job.waitForCompletion(true);
}
 
Example 11
Source File: HadoopCmdOutputTest.java    From kylin with Apache License 2.0
@Test(expected = IllegalStateException.class)
public void testGetMrJobIdThrowsIllegalStateException() throws IOException {
    Job job = Job.getInstance(new Configuration(false));
    HadoopCmdOutput hadoopCmdOutput = new HadoopCmdOutput(job, new StringBuilder());

    assertNull(hadoopCmdOutput.getMrJobId());
}
 
Example 12
Source File: MergeDictionaryJob.java    From kylin with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    try {
        Options options = new Options();
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_META_URL);
        options.addOption(OPTION_MERGE_SEGMENT_IDS);
        options.addOption(OPTION_OUTPUT_PATH_DICT);
        options.addOption(OPTION_OUTPUT_PATH_STAT);
        parseOptions(options, args);

        final String segmentId = getOptionValue(OPTION_SEGMENT_ID);
        final String segmentIds = getOptionValue(OPTION_MERGE_SEGMENT_IDS);
        final String cubeName = getOptionValue(OPTION_CUBE_NAME);
        final String metaUrl = getOptionValue(OPTION_META_URL);
        final String dictOutputPath = getOptionValue(OPTION_OUTPUT_PATH_DICT);
        final String statOutputPath = getOptionValue(OPTION_OUTPUT_PATH_STAT);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeDesc cubeDesc = cube.getDescriptor();
        CubeSegment segment = cube.getSegmentById(segmentId);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        job.getConfiguration().set(BatchConstants.ARG_CUBE_NAME, cubeName);
        job.getConfiguration().set(OPTION_META_URL.getOpt(), metaUrl);
        job.getConfiguration().set(OPTION_SEGMENT_ID.getOpt(), segmentId);
        job.getConfiguration().set(OPTION_MERGE_SEGMENT_IDS.getOpt(), segmentIds);
        job.getConfiguration().set(OPTION_OUTPUT_PATH_STAT.getOpt(), statOutputPath);
        job.getConfiguration().set("num.map.tasks", String.valueOf(cubeDesc.getAllColumnsNeedDictionaryBuilt().size() + 1));
        job.setNumReduceTasks(1);

        setJobClasspath(job, cube.getConfig());

        // dump metadata to HDFS
        attachSegmentsMetadataWithDict(mergingSeg, metaUrl);

        // clean output dir
        HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

        job.setMapperClass(MergeDictionaryMapper.class);
        job.setReducerClass(MergeDictionaryReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(IndexArrInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.NONE);
        SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

        logger.info("Starting: " + job.getJobName());

        return waitForCompletion(job);

    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
 
Example 13
Source File: MergeCuboidJob.java    From Kylin with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase();
        String segmentName = getOptionValue(OPTION_SEGMENT_NAME).toUpperCase();
        KylinConfig config = KylinConfig.getInstanceFromEnv();
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        System.out.println("Starting: " + jobName);
        job = Job.getInstance(getConf(), jobName);

        setJobClasspath(job);

        // set inputs
        addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);

        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);

        // Mapper
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(MergeCuboidMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reducer - only one
        job.setReducerClass(CuboidReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_NAME, segmentName);

        // add metadata to distributed cache
        attachKylinPropsAndMetadata(cube, job.getConfiguration());

        setReduceTaskNum(job, config, cubeName, 0);

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        logger.error("error in MergeCuboidJob", e);
        printUsage(options);
        throw e;
    }
}
 
Example 14
Source File: MergeCuboidJob.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        String input = getOptionValue(OPTION_INPUT_PATH);
        String output = getOptionValue(OPTION_OUTPUT_PATH);
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment cubeSeg = cube.getSegmentById(segmentID);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        logger.info("Starting: " + jobName);
        job = Job.getInstance(getConf(), jobName);

        setJobClasspath(job, cube.getConfig());

        // add metadata to distributed cache
        Segments<CubeSegment> allSegs = cube.getMergingSegments(cubeSeg);
        allSegs.add(cubeSeg);
        attachSegmentsMetadataWithDict(allSegs, job.getConfiguration());

        // Mapper
        job.setMapperClass(MergeCuboidMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reducer
        job.setReducerClass(CuboidReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // set inputs
        IMROutput2.IMRMergeOutputFormat outputFormat = MRUtil.getBatchMergeOutputSide2(cubeSeg).getOutputFormat();
        outputFormat.configureJobInput(job, input);
        addInputDirs(input, job);

        // set output
        outputFormat.configureJobOutput(job, output, cubeSeg);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
 
Example 15
Source File: HalyardBulkLoad.java    From Halyard with Apache License 2.0
@Override
protected int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String workdir = cmd.getOptionValue('w');
    String target = cmd.getOptionValue('t');
    getConf().setBoolean(SKIP_INVALID_PROPERTY, cmd.hasOption('i'));
    getConf().setBoolean(VERIFY_DATATYPE_VALUES_PROPERTY, cmd.hasOption('d'));
    getConf().setBoolean(TRUNCATE_PROPERTY, cmd.hasOption('r'));
    getConf().setInt(SPLIT_BITS_PROPERTY, Integer.parseInt(cmd.getOptionValue('b', "3")));
    if (cmd.hasOption('g')) getConf().set(DEFAULT_CONTEXT_PROPERTY, cmd.getOptionValue('g'));
    getConf().setBoolean(OVERRIDE_CONTEXT_PROPERTY, cmd.hasOption('o'));
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY, Long.parseLong(cmd.getOptionValue('e', String.valueOf(System.currentTimeMillis()))));
    if (cmd.hasOption('m')) getConf().setLong("mapreduce.input.fileinputformat.split.maxsize", Long.parseLong(cmd.getOptionValue('m')));
    TableMapReduceUtil.addDependencyJars(getConf(),
            NTriplesUtil.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + workdir + " -> " + target);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), target, true, getConf().getInt(SPLIT_BITS_PROPERTY, 3))) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, source);
        FileOutputFormat.setOutputPath(job, new Path(workdir));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            if (getConf().getBoolean(TRUNCATE_PROPERTY, false)) {
                HalyardTableUtils.truncateTable(hTable).close();
            }
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(workdir), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}
 
Example 16
Source File: MRInputHelpers.java    From tez with Apache License 2.0
/**
 * Helper api to generate splits
 * @param conf Configuration with all necessary information set to generate
 * splits. The following are required at a minimum:
 *
 *   - mapred.mapper.new-api: determine whether mapred.InputFormat or
 *     mapreduce.InputFormat is to be used
 *   - mapred.input.format.class or mapreduce.job.inputformat.class:
 *     determines the InputFormat class to be used
 *
 * In addition to this, all the configs needed by the InputFormat class also
 * have to be set. For example, FileInputFormat needs the input directory
 * paths to be set in the config.
 *
 * @param inputSplitsDir Directory in which the splits file and meta info file
 * will be generated. job.split and job.splitmetainfo files in this directory
 * will be overwritten. Should be a fully-qualified path.
 *
 * @return InputSplitInfo containing the split files' information and the
 * number of splits generated to be used to determining parallelism of
 * the map stage.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private static InputSplitInfoDisk generateInputSplits(Configuration conf,
                                                      Path inputSplitsDir) throws IOException, InterruptedException,
    ClassNotFoundException {
  Job job = Job.getInstance(conf);
  JobConf jobConf = new JobConf(conf);
  conf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);
  if (jobConf.getUseNewMapper()) {
    LOG.info("Generating new input splits"
        + ", splitsDir=" + inputSplitsDir.toString());
    return writeNewSplits(job, inputSplitsDir);
  } else {
    LOG.info("Generating old input splits"
        + ", splitsDir=" + inputSplitsDir.toString());
    return writeOldSplits(jobConf, inputSplitsDir);
  }
}
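As the javadoc above notes, the caller must have chosen between the old and new mapper API and configured an InputFormat before split generation runs. A sketch of that configuration under those assumptions, with the key names taken from the javadoc and an illustrative input directory:

Configuration conf = new Configuration();
// Use the new org.apache.hadoop.mapreduce API for split generation.
conf.setBoolean("mapred.mapper.new-api", true);
// InputFormat to use; FileInputFormat subclasses also need their input paths set.
conf.set("mapreduce.job.inputformat.class",
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat");
conf.set("mapreduce.input.fileinputformat.inputdir", "/data/input");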
 
Example 17
Source File: SparkUHCDictionary.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
 
Example 18
Source File: OSMConversionRunner.java    From geowave with Apache License 2.0
@Override
public int run(final String[] args) throws Exception {

  final Configuration conf = getConf();
  final AccumuloRequiredOptions accumuloOptions =
      (AccumuloRequiredOptions) inputStoreOptions.getFactoryOptions();

  // job settings

  final Job job = Job.getInstance(conf, ingestOptions.getJobName() + "NodeConversion");
  job.setJarByClass(OSMConversionRunner.class);

  job.getConfiguration().set("osm_mapping", ingestOptions.getMappingContents());
  job.getConfiguration().set("arguments", ingestOptions.serializeToString());

  if (ingestOptions.getVisibilityOptions().getVisibility() != null) {
    job.getConfiguration().set(
        AbstractMapReduceIngest.GLOBAL_VISIBILITY_KEY,
        ingestOptions.getVisibilityOptions().getVisibility());
  }

  // input format

  AbstractInputFormat.setConnectorInfo(
      job,
      accumuloOptions.getUser(),
      new PasswordToken(accumuloOptions.getPassword()));
  InputFormatBase.setInputTableName(job, ingestOptions.getQualifiedTableName());
  AbstractInputFormat.setZooKeeperInstance(
      job,
      new ClientConfiguration().withInstance(accumuloOptions.getInstance()).withZkHosts(
          accumuloOptions.getZookeeper()));
  AbstractInputFormat.setScanAuthorizations(
      job,
      new Authorizations(ingestOptions.getVisibilityOptions().getVisibility()));

  final IteratorSetting is = new IteratorSetting(50, "WholeRow", WholeRowIterator.class);
  InputFormatBase.addIterator(job, is);
  job.setInputFormatClass(AccumuloInputFormat.class);
  final Range r = new Range();
  // final ArrayList<Pair<Text, Text>> columns = new ArrayList<>();
  InputFormatBase.setRanges(job, Arrays.asList(r));

  // output format
  GeoWaveOutputFormat.setStoreOptions(job.getConfiguration(), inputStoreOptions);
  final AccumuloOptions options = new AccumuloOptions();
  final AdapterStore as =
      new AdapterStoreImpl(
          new AccumuloOperations(
              accumuloOptions.getZookeeper(),
              accumuloOptions.getInstance(),
              accumuloOptions.getUser(),
              accumuloOptions.getPassword(),
              accumuloOptions.getGeoWaveNamespace(),
              options),
          options);
  for (final FeatureDataAdapter fda : FeatureDefinitionSet.featureAdapters.values()) {
    as.addAdapter(fda);
    GeoWaveOutputFormat.addDataAdapter(job.getConfiguration(), fda);
  }

  final Index primaryIndex =
      new SpatialDimensionalityTypeProvider().createIndex(new SpatialOptions());
  GeoWaveOutputFormat.addIndex(job.getConfiguration(), primaryIndex);
  job.getConfiguration().set(AbstractMapReduceIngest.INDEX_NAMES_KEY, primaryIndex.getName());

  job.setOutputFormatClass(GeoWaveOutputFormat.class);
  job.setMapOutputKeyClass(GeoWaveOutputKey.class);
  job.setMapOutputValueClass(SimpleFeature.class);

  // mapper

  job.setMapperClass(OSMConversionMapper.class);

  // reducer
  job.setNumReduceTasks(0);

  return job.waitForCompletion(true) ? 0 : -1;
}
 
Example 19
Source File: ConvertToHFiles.java    From examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  try {
    Configuration conf = HBaseConfiguration.create();
    Connection connection = ConnectionFactory.createConnection(conf);

    String inputPath = args[0];
    String outputPath = args[1];
    final TableName tableName = TableName.valueOf(args[2]);

    // tag::SETUP[]
    Table table = connection.getTable(tableName);

    Job job = Job.getInstance(conf, "ConvertToHFiles: Convert CSV to HFiles");

    HFileOutputFormat2.configureIncrementalLoad(job, table,
                                      connection.getRegionLocator(tableName)); // <1>
    job.setInputFormatClass(TextInputFormat.class); // <2>

    job.setJarByClass(ConvertToHFiles.class); // <3>
    job.setJar("/home/cloudera/ahae/target/ahae.jar"); // <3>

    job.setMapperClass(ConvertToHFilesMapper.class); // <4>
    job.setMapOutputKeyClass(ImmutableBytesWritable.class); // <5>
    job.setMapOutputValueClass(KeyValue.class); // <6>

    FileInputFormat.setInputPaths(job, inputPath);
    HFileOutputFormat2.setOutputPath(job, new Path(outputPath));
    // end::SETUP[]

    if (!job.waitForCompletion(true)) {
      LOG.error("Failure");
    } else {
      LOG.info("Success");
      return 0;
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  return 1;
}
 
Example 20
Source File: ExtractDictionaryFromGlobalJob.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBING_JOB_ID);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
        job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);

        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        // ----------------------------------------------------------------------------
        // add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegmentById(segmentID);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        logger.info("Starting: " + job.getJobName());

        job.getConfiguration().set("mapreduce.map.speculative", "false");
        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(ExtractDictionaryFromGlobalMapper.class);

        // Reducer
        job.setNumReduceTasks(0);

        // Input
        IMRInput.IMRTableInputFormat flatTableInputFormat = MRUtil.getBatchCubingInputSide(segment)
                .getFlatTableInputFormat();
        flatTableInputFormat.configureJob(job);
        // Output
        //// prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        deletePath(job.getConfiguration(), output);

        attachSegmentMetadataWithDict(segment, job.getConfiguration());
        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}