Java Code Examples for org.apache.hadoop.mapreduce.Job#getInstance()
The following examples show how to use org.apache.hadoop.mapreduce.Job#getInstance().
Each example links to the original project and source file above the code.
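Before the project examples, here is a minimal, self-contained driver sketch of the basic Job.getInstance(Configuration, String) pattern. It is an illustration only: the class name PassThroughDriver and the command-line input/output paths are hypothetical placeholders, and the base Mapper/Reducer classes are used as identity functions just to keep the sketch compilable.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PassThroughDriver { // hypothetical class name, not taken from the examples below
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Job.getInstance(Configuration, String) copies the Configuration, so later
        // changes to 'conf' do not reach the job; use job.getConfiguration() instead.
        Job job = Job.getInstance(conf, "identity-passthrough");
        job.setJarByClass(PassThroughDriver.class);
        job.setMapperClass(Mapper.class);    // identity mapper
        job.setReducerClass(Reducer.class);  // identity reducer
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input dir (placeholder)
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output dir (placeholder)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The no-argument overload Job.getInstance() creates the job with a fresh Configuration; that form appears in Examples 5, 6, and 8 below.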
Example 1
Source File: UpdateCFJob.java From recsys-offline with Apache License 2.0 | 6 votes |
public void run() throws Exception {
    long startTime = System.currentTimeMillis();
    Configuration conf = new Configuration();
    conf.set(TableOutputFormat.OUTPUT_TABLE, Constants.hbase_user_item_pref_table);
    Job job = Job.getInstance(conf, "hbasewriter" + System.currentTimeMillis());
    job.setJarByClass(UpdateCFJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(HBaseWriteReducer.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TableOutputFormat.class);
    FileInputFormat.addInputPath(job, new Path(input));
    boolean isFinish = job.waitForCompletion(true);
    // Capture the end time after the job completes so the reported cost includes the run.
    long endTime = System.currentTimeMillis();
    if (isFinish) {
        logger.info("UpdateCFJob job [" + job.getJobName() + "] run finish. It costs "
                + (endTime - startTime) / 1000 + "s.");
    } else {
        logger.error("UpdateCFJob job [" + job.getJobName() + "] run failed.");
    }
}
Example 2
Source File: Export.java From hbase with Apache License 2.0 | 6 votes |
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    Triple<TableName, Scan, Path> arguments = ExportUtils.getArgumentsFromCommandLine(conf, args);
    String tableName = arguments.getFirst().getNameAsString();
    Path outputDir = arguments.getThird();
    Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJobName(NAME + "_" + tableName);
    job.setJarByClass(Export.class);
    // Set optional scan parameters
    Scan s = arguments.getSecond();
    IdentityTableMapper.initJob(tableName, s, IdentityTableMapper.class, job);
    // No reducers. Just write straight to output files.
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Result.class);
    FileOutputFormat.setOutputPath(job, outputDir); // job conf doesn't contain the conf so doesn't have a default fs.
    return job;
}
Example 3
Source File: ElemValueCooccurrencesTest.java From marklogic-contentpump with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 1) {
        System.err.println("Usage: ElemValueCooccurrencesTest configFile outputDir");
        System.exit(2);
    }

    Job job = Job.getInstance(conf);
    job.setJarByClass(ElemValueCooccurrencesTest.class);
    job.setInputFormatClass(ValueInputFormat.class);
    job.setMapperClass(ElemCooccurrencesMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
    conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class);
    conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS,
            ElemValueCooccurrencesFunction.class, ElemValueCooccurrences.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 4
Source File: TestJobCounters.java From big-c with Apache License 2.0 | 6 votes |
public static Job createJob() throws IOException {
    final Configuration conf = new Configuration();
    final Job baseJob = Job.getInstance(conf);
    baseJob.setOutputKeyClass(Text.class);
    baseJob.setOutputValueClass(IntWritable.class);
    baseJob.setMapperClass(NewMapTokenizer.class);
    baseJob.setCombinerClass(NewSummer.class);
    baseJob.setReducerClass(NewSummer.class);
    baseJob.setNumReduceTasks(1);
    baseJob.getConfiguration().setInt(JobContext.IO_SORT_MB, 1);
    baseJob.getConfiguration().set(JobContext.MAP_SORT_SPILL_PERCENT, "0.50");
    baseJob.getConfiguration().setInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setMinInputSplitSize(
            baseJob, Long.MAX_VALUE);
    return baseJob;
}
Example 5
Source File: CSVToParquet.java From parquet-flinktacular with Apache License 2.0 | 5 votes |
private static void createStoreSales(ExecutionEnvironment env) throws IOException {
    DataSet<Tuple2<Void, StoreSalesTable>> storeSales =
            getStoreSalesDataSet(env).map(new StoreSalesToParquet());

    Job job = Job.getInstance();
    HadoopOutputFormat hadoopOutputFormat =
            new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);

    ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/storesales"));
    ParquetThriftOutputFormat.setThriftClass(job, StoreSalesTable.class);
    ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(job, true);
    ParquetThriftOutputFormat.setEnableDictionary(job, false);

    storeSales.output(hadoopOutputFormat);
}
Example 6
Source File: TestMRCJCFileOutputCommitter.java From big-c with Apache License 2.0 | 5 votes |
@SuppressWarnings("unchecked") public void testCommitter() throws Exception { Job job = Job.getInstance(); FileOutputFormat.setOutputPath(job, outDir); Configuration conf = job.getConfiguration(); conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt); JobContext jContext = new JobContextImpl(conf, taskID.getJobID()); TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID); FileOutputCommitter committer = new FileOutputCommitter(outDir, tContext); // setup committer.setupJob(jContext); committer.setupTask(tContext); // write output TextOutputFormat theOutputFormat = new TextOutputFormat(); RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(tContext); writeOutput(theRecordWriter, tContext); // do commit committer.commitTask(tContext); committer.commitJob(jContext); // validate output File expectedFile = new File(new Path(outDir, partFile).toString()); StringBuffer expectedOutput = new StringBuffer(); expectedOutput.append(key1).append('\t').append(val1).append("\n"); expectedOutput.append(val1).append("\n"); expectedOutput.append(val2).append("\n"); expectedOutput.append(key2).append("\n"); expectedOutput.append(key1).append("\n"); expectedOutput.append(key2).append('\t').append(val2).append("\n"); String output = UtilsForTests.slurp(expectedFile); assertEquals(output, expectedOutput.toString()); FileUtil.fullyDelete(new File(outDir.toString())); }
Example 7
Source File: TestCombineTextInputFormat.java From big-c with Apache License 2.0 | 5 votes |
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, conf);
    localFs.delete(workDir, true);
    writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, workDir);
    CombineTextInputFormat format = new CombineTextInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    assertEquals("compressed splits == 1", 1, splits.size());
    List<Text> results = readSplit(format, splits.get(0), job);
    assertEquals("splits[0] length", 8, results.size());

    final String[] firstList =
            {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
    final String[] secondList = {"this is a test", "of gzip"};
    String first = results.get(0).toString();
    if (first.equals(firstList[0])) {
        testResults(results, firstList, secondList);
    } else if (first.equals(secondList[0])) {
        testResults(results, secondList, firstList);
    } else {
        fail("unexpected first token!");
    }
}
Example 8
Source File: ConfigurationProxyV2.java From pentaho-hadoop-shims with Apache License 2.0 | 5 votes |
public ConfigurationProxyV2( String namedCluster ) throws IOException {
    job = Job.getInstance();

    // Reset static HashSets for logging
    ShimConfigsLoader.CLUSTER_NAME_FOR_LOGGING.clear();
    ShimConfigsLoader.SITE_FILE_NAME.clear();

    addConfigsForJobConf( namedCluster );
}
Example 9
Source File: MRStressTest.java From incubator-gobblin with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    CommandLine cli = StressTestUtils.parseCommandLine(OPTIONS, args);

    Configuration configuration = new Configuration();
    if (cli.hasOption(THROTTLING_SERVER_URI.getOpt())) {
        configuration.setBoolean(USE_THROTTLING_SERVER, true);
        String resourceLimited = cli.getOptionValue(RESOURCE_ID_OPT.getOpt(), "MRStressTest");
        configuration.set(RESOURCE_ID, resourceLimited);
        configuration.set(
                BrokerConfigurationKeyGenerator.generateKey(new SharedRestClientFactory(),
                        new SharedRestClientKey(RestliLimiterFactory.RESTLI_SERVICE_NAME), null,
                        SharedRestClientFactory.SERVER_URI_KEY),
                cli.getOptionValue(THROTTLING_SERVER_URI.getOpt()));
    }
    if (cli.hasOption(LOCAL_QPS_OPT.getOpt())) {
        configuration.set(LOCALLY_ENFORCED_QPS, cli.getOptionValue(LOCAL_QPS_OPT.getOpt()));
    }

    Job job = Job.getInstance(configuration, "ThrottlingStressTest");
    job.getConfiguration().setBoolean("mapreduce.job.user.classpath.first", true);
    job.getConfiguration().setBoolean("mapreduce.map.speculative", false);
    job.getConfiguration().set(NUM_MAPPERS, cli.getOptionValue(NUM_MAPPERS_OPT.getOpt(), DEFAULT_MAPPERS));
    StressTestUtils.populateConfigFromCli(job.getConfiguration(), cli);

    job.setJarByClass(MRStressTest.class);
    job.setMapperClass(StresserMapper.class);
    job.setReducerClass(AggregatorReducer.class);
    job.setInputFormatClass(MyInputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileOutputFormat.setOutputPath(job, new Path("/tmp/MRStressTest" + System.currentTimeMillis()));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 10
Source File: CalculateSimilarityStep4.java From RecommendationEngine with MIT License | 5 votes |
public static void run() throws IOException, ClassNotFoundException, InterruptedException {
    String inputPath = ItemBasedCFDriver.path.get("step4InputPath");
    String outputPath = ItemBasedCFDriver.path.get("step4OutputPath");

    Configuration conf = new Configuration();
    conf.set("mapred.textoutputformat.separator", ":");

    Job job = Job.getInstance(conf);

    HDFS hdfs = new HDFS(conf);
    hdfs.rmr(outputPath);

    job.setMapperClass(Step4_Mapper.class);
    job.setReducerClass(Step4_Reducer.class);
    job.setJarByClass(CalculateSimilarityStep4.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);
}
Example 11
Source File: HadoopCmdOutputTest.java From kylin with Apache License 2.0 | 5 votes |
@Test(expected = IllegalStateException.class)
public void testGetMrJobIdThrowsIllegalStateException() throws IOException {
    Job job = Job.getInstance(new Configuration(false));
    HadoopCmdOutput hadoopCmdOutput = new HadoopCmdOutput(job, new StringBuilder());
    assertNull(hadoopCmdOutput.getMrJobId());
}
Example 12
Source File: MergeDictionaryJob.java From kylin with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    try {
        Options options = new Options();
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_META_URL);
        options.addOption(OPTION_MERGE_SEGMENT_IDS);
        options.addOption(OPTION_OUTPUT_PATH_DICT);
        options.addOption(OPTION_OUTPUT_PATH_STAT);
        parseOptions(options, args);

        final String segmentId = getOptionValue(OPTION_SEGMENT_ID);
        final String segmentIds = getOptionValue(OPTION_MERGE_SEGMENT_IDS);
        final String cubeName = getOptionValue(OPTION_CUBE_NAME);
        final String metaUrl = getOptionValue(OPTION_META_URL);
        final String dictOutputPath = getOptionValue(OPTION_OUTPUT_PATH_DICT);
        final String statOutputPath = getOptionValue(OPTION_OUTPUT_PATH_STAT);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeDesc cubeDesc = cube.getDescriptor();
        CubeSegment segment = cube.getSegmentById(segmentId);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        job.getConfiguration().set(BatchConstants.ARG_CUBE_NAME, cubeName);
        job.getConfiguration().set(OPTION_META_URL.getOpt(), metaUrl);
        job.getConfiguration().set(OPTION_SEGMENT_ID.getOpt(), segmentId);
        job.getConfiguration().set(OPTION_MERGE_SEGMENT_IDS.getOpt(), segmentIds);
        job.getConfiguration().set(OPTION_OUTPUT_PATH_STAT.getOpt(), statOutputPath);
        job.getConfiguration().set("num.map.tasks",
                String.valueOf(cubeDesc.getAllColumnsNeedDictionaryBuilt().size() + 1));
        job.setNumReduceTasks(1);

        setJobClasspath(job, cube.getConfig());

        // dump metadata to HDFS
        attachSegmentsMetadataWithDict(mergingSeg, metaUrl);

        // clean output dir
        HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

        job.setMapperClass(MergeDictionaryMapper.class);
        job.setReducerClass(MergeDictionaryReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(IndexArrInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.NONE);
        SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

        logger.info("Starting: " + job.getJobName());

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 13
Source File: MergeCuboidJob.java From Kylin with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase();
        String segmentName = getOptionValue(OPTION_SEGMENT_NAME).toUpperCase();
        KylinConfig config = KylinConfig.getInstanceFromEnv();
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        System.out.println("Starting: " + jobName);
        job = Job.getInstance(getConf(), jobName);

        setJobClasspath(job);

        // set inputs
        addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);

        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);

        // Mapper
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(MergeCuboidMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reducer - only one
        job.setReducerClass(CuboidReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_NAME, segmentName);

        // add metadata to distributed cache
        attachKylinPropsAndMetadata(cube, job.getConfiguration());

        setReduceTaskNum(job, config, cubeName, 0);

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        logger.error("error in MergeCuboidJob", e);
        printUsage(options);
        throw e;
    }
}
Example 14
Source File: MergeCuboidJob.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        String input = getOptionValue(OPTION_INPUT_PATH);
        String output = getOptionValue(OPTION_OUTPUT_PATH);
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment cubeSeg = cube.getSegmentById(segmentID);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        logger.info("Starting: " + jobName);
        job = Job.getInstance(getConf(), jobName);

        setJobClasspath(job, cube.getConfig());

        // add metadata to distributed cache
        Segments<CubeSegment> allSegs = cube.getMergingSegments(cubeSeg);
        allSegs.add(cubeSeg);
        attachSegmentsMetadataWithDict(allSegs, job.getConfiguration());

        // Mapper
        job.setMapperClass(MergeCuboidMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reducer
        job.setReducerClass(CuboidReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // set inputs
        IMROutput2.IMRMergeOutputFormat outputFormat =
                MRUtil.getBatchMergeOutputSide2(cubeSeg).getOutputFormat();
        outputFormat.configureJobInput(job, input);
        addInputDirs(input, job);

        // set output
        outputFormat.configureJobOutput(job, output, cubeSeg);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 15
Source File: HalyardBulkLoad.java From Halyard with Apache License 2.0 | 4 votes |
@Override
protected int run(CommandLine cmd) throws Exception {
    String source = cmd.getOptionValue('s');
    String workdir = cmd.getOptionValue('w');
    String target = cmd.getOptionValue('t');
    getConf().setBoolean(SKIP_INVALID_PROPERTY, cmd.hasOption('i'));
    getConf().setBoolean(VERIFY_DATATYPE_VALUES_PROPERTY, cmd.hasOption('d'));
    getConf().setBoolean(TRUNCATE_PROPERTY, cmd.hasOption('r'));
    getConf().setInt(SPLIT_BITS_PROPERTY, Integer.parseInt(cmd.getOptionValue('b', "3")));
    if (cmd.hasOption('g')) getConf().set(DEFAULT_CONTEXT_PROPERTY, cmd.getOptionValue('g'));
    getConf().setBoolean(OVERRIDE_CONTEXT_PROPERTY, cmd.hasOption('o'));
    getConf().setLong(DEFAULT_TIMESTAMP_PROPERTY,
            Long.parseLong(cmd.getOptionValue('e', String.valueOf(System.currentTimeMillis()))));
    if (cmd.hasOption('m')) getConf().setLong("mapreduce.input.fileinputformat.split.maxsize",
            Long.parseLong(cmd.getOptionValue('m')));
    TableMapReduceUtil.addDependencyJars(getConf(),
            NTriplesUtil.class,
            Rio.class,
            AbstractRDFHandler.class,
            RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + workdir + " -> " + target);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), target, true,
            getConf().getInt(SPLIT_BITS_PROPERTY, 3))) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, source);
        FileOutputFormat.setOutputPath(job, new Path(workdir));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            if (getConf().getBoolean(TRUNCATE_PROPERTY, false)) {
                HalyardTableUtils.truncateTable(hTable).close();
            }
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(workdir), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}
Example 16
Source File: MRInputHelpers.java From tez with Apache License 2.0 | 4 votes |
/**
 * Helper api to generate splits
 * @param conf Configuration with all necessary information set to generate
 * splits. The following are required at a minimum:
 *
 *   - mapred.mapper.new-api: determine whether mapred.InputFormat or
 *     mapreduce.InputFormat is to be used
 *   - mapred.input.format.class or mapreduce.job.inputformat.class:
 *     determines the InputFormat class to be used
 *
 * In addition to this, all the configs needed by the InputFormat class also
 * have to be set. For example, FileInputFormat needs the input directory
 * paths to be set in the config.
 *
 * @param inputSplitsDir Directory in which the splits file and meta info file
 * will be generated. job.split and job.splitmetainfo files in this directory
 * will be overwritten. Should be a fully-qualified path.
 *
 * @return InputSplitInfo containing the split files' information and the
 * number of splits generated to be used to determining parallelism of
 * the map stage.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
private static InputSplitInfoDisk generateInputSplits(Configuration conf,
        Path inputSplitsDir) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(conf);
    JobConf jobConf = new JobConf(conf);
    conf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);
    if (jobConf.getUseNewMapper()) {
        LOG.info("Generating new input splits"
                + ", splitsDir=" + inputSplitsDir.toString());
        return writeNewSplits(job, inputSplitsDir);
    } else {
        LOG.info("Generating old input splits"
                + ", splitsDir=" + inputSplitsDir.toString());
        return writeOldSplits(jobConf, inputSplitsDir);
    }
}
Example 17
Source File: SparkUHCDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
Example 18
Source File: OSMConversionRunner.java From geowave with Apache License 2.0 | 4 votes |
@Override
public int run(final String[] args) throws Exception {

    final Configuration conf = getConf();
    final AccumuloRequiredOptions accumuloOptions =
            (AccumuloRequiredOptions) inputStoreOptions.getFactoryOptions();

    // job settings
    final Job job = Job.getInstance(conf, ingestOptions.getJobName() + "NodeConversion");
    job.setJarByClass(OSMConversionRunner.class);

    job.getConfiguration().set("osm_mapping", ingestOptions.getMappingContents());
    job.getConfiguration().set("arguments", ingestOptions.serializeToString());

    if (ingestOptions.getVisibilityOptions().getVisibility() != null) {
        job.getConfiguration().set(
                AbstractMapReduceIngest.GLOBAL_VISIBILITY_KEY,
                ingestOptions.getVisibilityOptions().getVisibility());
    }

    // input format
    AbstractInputFormat.setConnectorInfo(
            job,
            accumuloOptions.getUser(),
            new PasswordToken(accumuloOptions.getPassword()));
    InputFormatBase.setInputTableName(job, ingestOptions.getQualifiedTableName());
    AbstractInputFormat.setZooKeeperInstance(
            job,
            new ClientConfiguration().withInstance(accumuloOptions.getInstance()).withZkHosts(
                    accumuloOptions.getZookeeper()));
    AbstractInputFormat.setScanAuthorizations(
            job,
            new Authorizations(ingestOptions.getVisibilityOptions().getVisibility()));

    final IteratorSetting is = new IteratorSetting(50, "WholeRow", WholeRowIterator.class);
    InputFormatBase.addIterator(job, is);
    job.setInputFormatClass(AccumuloInputFormat.class);
    final Range r = new Range();
    // final ArrayList<Pair<Text, Text>> columns = new ArrayList<>();
    InputFormatBase.setRanges(job, Arrays.asList(r));

    // output format
    GeoWaveOutputFormat.setStoreOptions(job.getConfiguration(), inputStoreOptions);
    final AccumuloOptions options = new AccumuloOptions();
    final AdapterStore as =
            new AdapterStoreImpl(
                    new AccumuloOperations(
                            accumuloOptions.getZookeeper(),
                            accumuloOptions.getInstance(),
                            accumuloOptions.getUser(),
                            accumuloOptions.getPassword(),
                            accumuloOptions.getGeoWaveNamespace(),
                            options),
                    options);
    for (final FeatureDataAdapter fda : FeatureDefinitionSet.featureAdapters.values()) {
        as.addAdapter(fda);
        GeoWaveOutputFormat.addDataAdapter(job.getConfiguration(), fda);
    }
    final Index primaryIndex =
            new SpatialDimensionalityTypeProvider().createIndex(new SpatialOptions());
    GeoWaveOutputFormat.addIndex(job.getConfiguration(), primaryIndex);
    job.getConfiguration().set(AbstractMapReduceIngest.INDEX_NAMES_KEY, primaryIndex.getName());

    job.setOutputFormatClass(GeoWaveOutputFormat.class);
    job.setMapOutputKeyClass(GeoWaveOutputKey.class);
    job.setMapOutputValueClass(SimpleFeature.class);

    // mapper
    job.setMapperClass(OSMConversionMapper.class);
    // reducer
    job.setNumReduceTasks(0);

    return job.waitForCompletion(true) ? 0 : -1;
}
Example 19
Source File: ConvertToHFiles.java From examples with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    try {
        Configuration conf = HBaseConfiguration.create();
        Connection connection = ConnectionFactory.createConnection(conf);

        String inputPath = args[0];
        String outputPath = args[1];
        final TableName tableName = TableName.valueOf(args[2]);

        // tag::SETUP[]
        Table table = connection.getTable(tableName);

        Job job = Job.getInstance(conf, "ConvertToHFiles: Convert CSV to HFiles");
        HFileOutputFormat2.configureIncrementalLoad(job, table,
                connection.getRegionLocator(tableName)); // <1>

        job.setInputFormatClass(TextInputFormat.class); // <2>

        job.setJarByClass(ConvertToHFiles.class); // <3>
        job.setJar("/home/cloudera/ahae/target/ahae.jar"); // <3>

        job.setMapperClass(ConvertToHFilesMapper.class); // <4>
        job.setMapOutputKeyClass(ImmutableBytesWritable.class); // <5>
        job.setMapOutputValueClass(KeyValue.class); // <6>

        FileInputFormat.setInputPaths(job, inputPath);
        HFileOutputFormat2.setOutputPath(job, new Path(outputPath));
        // end::SETUP[]

        if (!job.waitForCompletion(true)) {
            LOG.error("Failure");
        } else {
            LOG.info("Success");
            return 0;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return 1;
}
Example 20
Source File: ExtractDictionaryFromGlobalJob.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBING_JOB_ID);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
        job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);

        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        // ----------------------------------------------------------------------------
        // add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegmentById(segmentID);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        logger.info("Starting: " + job.getJobName());

        job.getConfiguration().set("mapreduce.map.speculative", "false");
        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(ExtractDictionaryFromGlobalMapper.class);

        // Reducer
        job.setNumReduceTasks(0);

        // Input
        IMRInput.IMRTableInputFormat flatTableInputFormat = MRUtil.getBatchCubingInputSide(segment)
                .getFlatTableInputFormat();
        flatTableInputFormat.configureJob(job);

        // Output
        //// prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        deletePath(job.getConfiguration(), output);

        attachSegmentMetadataWithDict(segment, job.getConfiguration());
        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}