Java Code Examples for org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat#setOutputPath()
The following examples show how to use
org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat#setOutputPath().
The examples are drawn from open-source projects; the source file, project, and license are listed above each example.
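As a starting point, here is a minimal, self-contained driver sketch showing the call pattern the examples below share: setOutputPath() (inherited from FileOutputFormat) names the directory the job writes to, and setOutputFormatClass(SequenceFileOutputFormat.class) makes the job emit its key/value pairs as a SequenceFile. The class name, the identity Mapper/Reducer, and the argument-based paths are illustrative assumptions, not code from any of the projects below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SequenceFileOutputDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "sequence-file-output-example");
        job.setJarByClass(SequenceFileOutputDriver.class);

        // Identity mapper and reducer: records pass through unchanged.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);

        // TextInputFormat produces LongWritable offsets and Text lines,
        // so the identity map/reduce chain keeps those types on output.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Read plain text, write the result as a binary SequenceFile.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        TextInputFormat.addInputPath(job, new Path(args[0]));
        // setOutputPath is inherited from FileOutputFormat; the output
        // directory must not exist yet or the job fails its output check.
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because the output directory must not already exist, several of the examples below delete the target path first (for instance via HadoopUtil.deletePath) before configuring the job.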
Example 1
Source File: Step32.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    // TODO Auto-generated method stub
    Configuration conf1 = new Configuration();
    Job job1 = new Job(conf1, "wiki job one");
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step32.class);
    job1.setMapperClass(WikiMapper32.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);
    job1.setReducerClass(WiKiReducer32.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorOrPrefWritable.class);
    // the WiKiDriver's output is this one's input
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}
Example 2
Source File: Step1.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws Exception {
    Configuration conf1 = new Configuration();
    Job job1 = new Job(conf1, "step1");
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step1.class);
    job1.setMapperClass(WikiMapper1.class);
    job1.setMapOutputKeyClass(VarLongWritable.class);
    job1.setMapOutputValueClass(LongAndFloat.class);
    job1.setReducerClass(WiKiReducer1.class);
    job1.setOutputKeyClass(VarLongWritable.class);
    job1.setOutputValueClass(VectorWritable.class);
    FileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1);
    }
}
Example 3
Source File: Step2.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf1 = new Configuration(); Job job1 = new Job(conf1, "wiki job two"); job1.setNumReduceTasks(1); job1.setJarByClass(Step2.class); job1.setInputFormatClass(SequenceFileInputFormat.class); job1.setMapperClass(WikiMapper2.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(IntWritable.class); job1.setReducerClass(WiKiReducer2.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(VectorWritable.class); job1.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH)); SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH)); if(!job1.waitForCompletion(true)){ System.exit(1); // run error then exit } }
Example 4
Source File: Step31.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf1 = new Configuration(); Job job1 = new Job(conf1, "wiki job three1"); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setInputFormatClass(SequenceFileInputFormat.class); job1.setNumReduceTasks(1); job1.setJarByClass(Step31.class); job1.setMapperClass(WikiMapper31.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(VectorOrPrefWritable.class); // set a reducer only to use SequenceFileOutputFormat job1.setReducerClass(WiKiReducer31.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(VectorOrPrefWritable.class); // this MR's input is the MR2's output SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH)); SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH)); if(!job1.waitForCompletion(true)){ System.exit(1); // run error then exit } }
Example 5
Source File: AbstractReasoningTool.java From rya with Apache License 2.0
/**
 * Set up a MapReduce job to output newly derived triples.
 * @param intermediate True if this is intermediate data. Outputs
 *     to [base]-[iteration]-[temp].
 */
protected void configureDerivationOutput(boolean intermediate) {
    Path outPath;
    Configuration conf = job.getConfiguration();
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    if (intermediate) {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration + MRReasoningUtils.TEMP_SUFFIX);
    } else {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration);
    }
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT,
        SequenceFileOutputFormat.class, Derivation.class, NullWritable.class);
    MultipleOutputs.setCountersEnabled(job, true);
    // Set up an output for diagnostic info, if needed
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
}
Example 6
Source File: DistBlockIntegrityMonitor.java From RDFS with Apache License 2.0
/**
 * creates and submits a job, updates file index and job index
 */
private void startJob(String jobName, Set<String> lostFiles, Priority priority, long detectTime)
        throws IOException, InterruptedException, ClassNotFoundException {
    Path inDir = new Path(JOB_NAME_PREFIX + "/in/" + jobName);
    Path outDir = new Path(JOB_NAME_PREFIX + "/out/" + jobName);
    List<String> filesInJob = createInputFile(jobName, inDir, lostFiles);
    if (filesInJob.isEmpty()) return;

    Configuration jobConf = new Configuration(getConf());
    RaidUtils.parseAndSetOptions(jobConf, priority.configOption);
    Job job = new Job(jobConf, jobName);
    job.getConfiguration().set(CORRUPT_FILE_DETECT_TIME, Long.toString(detectTime));
    configureJob(job, this.RECONSTRUCTOR_CLASS);
    job.setJarByClass(getClass());
    job.setMapperClass(ReconstructionMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(ReconstructionInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    ReconstructionInputFormat.setInputPaths(job, inDir);
    SequenceFileOutputFormat.setOutputPath(job, outDir);

    submitJob(job, filesInJob, priority);
    List<LostFileInfo> fileInfos = updateFileIndex(jobName, filesInJob, priority);
    // The implementation of submitJob() need not update jobIndex.
    // So check if the job exists in jobIndex before updating jobInfos.
    if (jobIndex.containsKey(job)) {
        jobIndex.put(job, fileInfos);
    }
    numJobsRunning++;
}
Example 7
Source File: AbstractReasoningTool.java From rya with Apache License 2.0
/**
 * Set up the MapReduce job to output a schema (TBox).
 */
protected void configureSchemaOutput() {
    Path outPath = MRReasoningUtils.getSchemaPath(job.getConfiguration());
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SchemaWritable.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, "schemaobj",
        SequenceFileOutputFormat.class, NullWritable.class, SchemaWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}
Example 8
Source File: JoinSelectStatisticsTest.java From rya with Apache License 2.0
public static void initTabToSeqFileJob(Job job, String intable, String outpath)
        throws AccumuloSecurityException, IOException {
    Configuration conf = job.getConfiguration();
    String username = conf.get(USERNAME);
    System.out.println("Username is " + username);
    String password = conf.get(PASSWORD);
    String instance = conf.get(INSTANCE);
    System.out.println("Instance is " + instance);

    AccumuloInputFormat.setMockInstance(job, instance);
    AccumuloInputFormat.setConnectorInfo(job, username, new PasswordToken(password));
    AccumuloInputFormat.setInputTableName(job, intable);

    job.setInputFormatClass(AccumuloInputFormat.class);
    job.setMapOutputKeyClass(CompositeType.class);
    job.setMapOutputValueClass(TripleCard.class);

    System.out.println("Outpath is " + outpath);

    // OUTPUT
    if (outpath.equals("spo")) {
        SPOOUT = new File(File.createTempFile(outpath, "txt").getParentFile(),
            System.currentTimeMillis() + "spo");
        SequenceFileOutputFormat.setOutputPath(job, new Path(SPOOUT.getAbsolutePath()));
    } else {
        PROSPECTSOUT = new File(File.createTempFile(outpath, "txt").getParentFile(),
            System.currentTimeMillis() + "prospects");
        SequenceFileOutputFormat.setOutputPath(job, new Path(PROSPECTSOUT.getAbsolutePath()));
    }
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(CompositeType.class);
    job.setOutputValueClass(TripleCard.class);
}
Example 9
Source File: JoinSelectStatisticsTest.java From rya with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String outpath = conf.get(OUTPUTPATH);

    Job job = new Job(conf, this.getClass().getSimpleName() + "_" + System.currentTimeMillis());
    job.setJarByClass(this.getClass());
    conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true);

    MultipleInputs.addInputPath(job, new Path(PROSPECTSOUT.getAbsolutePath()),
        SequenceFileInputFormat.class, JoinSelectAggregateMapper.class);
    MultipleInputs.addInputPath(job, new Path(SPOOUT.getAbsolutePath()),
        SequenceFileInputFormat.class, JoinSelectAggregateMapper.class);
    job.setMapOutputKeyClass(CompositeType.class);
    job.setMapOutputValueClass(TripleCard.class);

    tempDir = new File(File.createTempFile(outpath, "txt").getParentFile(),
        System.currentTimeMillis() + "");
    SequenceFileOutputFormat.setOutputPath(job, new Path(tempDir.getAbsolutePath()));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(TripleEntry.class);
    job.setOutputValueClass(CardList.class);

    job.setSortComparatorClass(JoinSelectSortComparator.class);
    job.setGroupingComparatorClass(JoinSelectGroupComparator.class);
    job.setPartitionerClass(JoinSelectPartitioner.class);
    job.setReducerClass(JoinReducer.class);
    job.setNumReduceTasks(32);
    job.waitForCompletion(true);

    return job.isSuccessful() ? 0 : 1;
}
Example 10
Source File: TestMROutputLegacy.java From tez with Apache License 2.0
@Test (timeout = 5000)
public void testNewAPI_MapperOnly() throws Exception {
    String outputPath = TEST_DIR.getAbsolutePath();
    Job job = Job.getInstance();
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.getConfiguration().setBoolean("mapred.mapper.new-api", true);
    // the output is attached to mapper
    job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, true);
    UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
    OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
        .setUserPayload(vertexPayload);
    DataSinkDescriptor sink = DataSinkDescriptor.create(od,
        OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);

    OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
    MROutputLegacy output = new MROutputLegacy(outputContext, 2);
    output.initialize();

    assertEquals(true, output.useNewApi);
    assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
    assertNull(output.oldOutputFormat);
    assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
    assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
    assertNull(output.oldApiTaskAttemptContext);
    assertNotNull(output.newRecordWriter);
    assertNull(output.oldRecordWriter);
    assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
Example 11
Source File: JoinSelectStatsUtil.java From rya with Apache License 2.0
public static void initTabToSeqFileJob(Job job, String intable, String outpath, String auths)
        throws AccumuloSecurityException {
    Configuration conf = job.getConfiguration();
    String username = conf.get(USERNAME);
    String password = conf.get(PASSWORD);
    String instance = conf.get(INSTANCE);
    String zookeepers = conf.get(ZOOKEEPERS);

    System.out.println("Zookeepers are " + auths);

    if (zookeepers != null) {
        AccumuloInputFormat.setZooKeeperInstance(job, instance, zookeepers);
    } else {
        throw new IllegalArgumentException("Must specify either mock or zookeepers");
    }

    AccumuloInputFormat.setConnectorInfo(job, username, new PasswordToken(password));
    AccumuloInputFormat.setScanAuthorizations(job, new Authorizations(auths));
    AccumuloInputFormat.setInputTableName(job, intable);
    job.setInputFormatClass(AccumuloInputFormat.class);
    job.setMapOutputKeyClass(CompositeType.class);
    job.setMapOutputValueClass(TripleCard.class);

    // OUTPUT
    SequenceFileOutputFormat.setOutputPath(job, new Path(outpath));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(CompositeType.class);
    job.setOutputValueClass(TripleCard.class);
}
Example 12
Source File: TestMROutputLegacy.java From tez with Apache License 2.0
@Test (timeout = 5000)
public void testNewAPI_MR() throws Exception {
    String outputPath = TEST_DIR.getAbsolutePath();
    Job job = Job.getInstance();
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.getConfiguration().setBoolean("mapred.reducer.new-api", true);
    // the output is attached to reducer
    job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, false);
    UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
    OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
        .setUserPayload(vertexPayload);
    DataSinkDescriptor sink = DataSinkDescriptor.create(od,
        OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);

    OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
    MROutputLegacy output = new MROutputLegacy(outputContext, 2);
    output.initialize();

    assertEquals(true, output.useNewApi);
    assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
    assertNull(output.oldOutputFormat);
    assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
    assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
    assertNull(output.oldApiTaskAttemptContext);
    assertNotNull(output.newRecordWriter);
    assertNull(output.oldRecordWriter);
    assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
Example 13
Source File: PcapJob.java From metron with Apache License 2.0
/**
 * Creates, but does not submit the job. This is the core MapReduce mrJob. Empty input path
 * results in a null to be returned instead of creating the job.
 */
public Job createJob(Optional<String> jobName
                    , Path basePath
                    , Path jobOutputPath
                    , long beginNS
                    , long endNS
                    , int numReducers
                    , T fields
                    , Configuration conf
                    , FileSystem fs
                    , PcapFilterConfigurator<T> filterImpl
                    ) throws IOException {
    Iterable<String> filteredPaths = FileFilterUtil.getPathsInTimeRange(beginNS, endNS, listFiles(fs, basePath));
    String inputPaths = Joiner.on(',').join(filteredPaths);
    if (StringUtils.isEmpty(inputPaths)) {
        return null;
    }
    conf.set(START_TS_CONF, Long.toUnsignedString(beginNS));
    conf.set(END_TS_CONF, Long.toUnsignedString(endNS));
    conf.set(WIDTH_CONF, "" + findWidth(beginNS, endNS, numReducers));
    filterImpl.addToConfig(fields, conf);
    Job job = Job.getInstance(conf);
    jobName.ifPresent(job::setJobName);
    job.setJarByClass(PcapJob.class);
    job.setMapperClass(PcapJob.PcapMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(numReducers);
    job.setReducerClass(PcapReducer.class);
    job.setPartitionerClass(PcapPartitioner.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    SequenceFileInputFormat.addInputPaths(job, inputPaths);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, jobOutputPath);
    return job;
}
Example 14
Source File: FlinkMergingDictionary.java From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String segmentIds = optionsHelper.getOptionValue(OPTION_MERGE_SEGMENT_IDS);
    final String dictOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_DICT);
    final String statOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_STAT);
    final String enableObjectReuseOptValue = optionsHelper.getOptionValue(OPTION_ENABLE_OBJECT_REUSE);

    boolean enableObjectReuse = false;
    if (enableObjectReuseOptValue != null && !enableObjectReuseOptValue.isEmpty()) {
        enableObjectReuse = true;
    }

    final Job job = Job.getInstance();

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (enableObjectReuse) {
        env.getConfig().enableObjectReuse();
    }

    HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

    final SerializableConfiguration sConf = new SerializableConfiguration(job.getConfiguration());
    final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = CubeDescManager.getInstance(envConfig).getCubeDesc(cubeInstance.getDescName());

    logger.info("Dictionary output path: {}", dictOutputPath);
    logger.info("Statistics output path: {}", statOutputPath);

    final TblColRef[] tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
    final int columnLength = tblColRefs.length;

    List<Integer> indexs = Lists.newArrayListWithCapacity(columnLength);
    for (int i = 0; i <= columnLength; i++) {
        indexs.add(i);
    }

    DataSource<Integer> indexDS = env.fromCollection(indexs);

    DataSet<Tuple2<Text, Text>> colToDictPathDS = indexDS.map(new MergeDictAndStatsFunction(cubeName,
        metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));

    FlinkUtil.setHadoopConfForCuboid(job, null, null);
    HadoopOutputFormat<Text, Text> hadoopOF = new HadoopOutputFormat<>(new SequenceFileOutputFormat<>(), job);
    SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

    colToDictPathDS.output(hadoopOF).setParallelism(1);

    env.execute("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
}
Example 15
Source File: DataProfNoCriteriaJobExecutor.java From jumbune with GNU Lesser General Public License v3.0
/***
 * main method for job execution
 * @param args
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    LOGGER.debug("Data Profiling job values respectively [" + otherArgs[0] + "], " + otherArgs[1]);
    StringBuilder sb = new StringBuilder();
    int dynamicArgs = 0;
    dynamicArgs = ((otherArgs.length) - 1);

    for (int i = dynamicArgs; i < otherArgs.length; i++) {
        LOGGER.debug("other arguments" + otherArgs[i]);
        sb.append(otherArgs[i]);
    }

    String outputPath = DataProfilingConstants.OUTPUT_DIR_PATH + new Date().getTime();
    String inputPath = otherArgs[0].replace(" ", "");
    String dpBeanString = sb.toString();
    LOGGER.debug("Received dpBean value [" + dpBeanString + "]");
    Gson gson = new Gson();
    Type type = new TypeToken<DataProfilingBean>() {
    }.getType();

    DataProfilingBean dataProfilingBean = gson.fromJson(dpBeanString, type);
    String recordSeparator = dataProfilingBean.getRecordSeparator();
    conf.set(DataProfilingConstants.DATA_PROFILING_BEAN, dpBeanString);
    conf.set(DataProfilingConstants.RECORD_SEPARATOR, recordSeparator);
    conf.set(DataProfilingConstants.TEXTINPUTFORMAT_RECORD_DELIMITER, recordSeparator);

    Job job = new Job(conf, DataProfilingConstants.JOB_NAME);
    job.setJarByClass(DataProfNoCriteriaJobExecutor.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(DataProfNoCriteriaMapper.class);
    job.setCombinerClass(DataProfNoCriteriaReducer.class);
    job.setReducerClass(DataProfNoCriteriaReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    Path[] inputPaths = FileUtil.getAllNestedFilePath(job, inputPath);
    TextInputFormat.setInputPaths(job, inputPaths);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);

    Map<String, Integer> sortedMap = readJobOutputFromHdfs(conf, outputPath);
    final Gson dpReportGson = new GsonBuilder().disableHtmlEscaping().create();
    final String jsonString = dpReportGson.toJson(sortedMap);
    LOGGER.info(DataProfilingConstants.DATA_PROFILING_REPORT + jsonString);
}
Example 16
Source File: DataProfilingJobExecutor.java From jumbune with GNU Lesser General Public License v3.0
/**
 * @param args
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    LOGGER.debug("Data Profiling job values respectively [" + otherArgs[0] + "], " + otherArgs[1]);
    StringBuilder sb = new StringBuilder();
    int dynamicArgs = 0;
    dynamicArgs = ((otherArgs.length) - 1);

    for (int i = dynamicArgs; i < otherArgs.length; i++) {
        LOGGER.debug("other arguments" + otherArgs[i]);
        sb.append(otherArgs[i]);
    }

    String outputPath = DataProfilingConstants.OUTPUT_DIR_PATH + new Date().getTime();
    String inputPath = otherArgs[0];
    String dpBeanString = sb.toString();
    LOGGER.debug("Received dpBean value [" + dpBeanString + "]");
    Gson gson = new Gson();
    Type type = new TypeToken<DataProfilingBean>() {
    }.getType();

    DataProfilingBean dataProfilingBean = gson.fromJson(dpBeanString, type);
    String recordSeparator = dataProfilingBean.getRecordSeparator();
    conf.set(DataProfilingConstants.DATA_PROFILING_BEAN, dpBeanString);
    conf.set(DataProfilingConstants.RECORD_SEPARATOR, recordSeparator);
    conf.set(DataProfilingConstants.TEXTINPUTFORMAT_RECORD_DELIMITER, recordSeparator);

    Job job = new Job(conf, DataProfilingConstants.JOB_NAME);
    job.setJarByClass(DataProfilingJobExecutor.class);
    job.setMapperClass(DataProfilingMapper.class);
    job.setCombinerClass(DataProfilingReducer.class);
    job.setReducerClass(DataProfilingReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    Path[] inputPaths = FileUtil.getAllNestedFilePath(job, inputPath);
    TextInputFormat.setInputPaths(job, inputPaths);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);
    LOGGER.debug("Job completed , now going to read the result from hdfs");

    Set<CriteriaBasedDataProfiling> criteriaBasedDataProfilings = readJobOutputFromHdfs(conf, outputPath, dataProfilingBean);
    final Gson dpReportGson = new GsonBuilder().disableHtmlEscaping().create();
    final String jsonString = dpReportGson.toJson(criteriaBasedDataProfilings);
    LOGGER.info(DataProfilingConstants.DATA_PROFILING_REPORT + jsonString);
}
Example 17
Source File: ExportHBaseTableToDelimiteredSeq.java From HBase-ToHDFS with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    if (args.length == 0) {
        System.out.println("ExportHBaseTableToDelimiteredSeq {tableName} {ColumnFamily} {outputPath} "
            + "{compressionCodec} {schemaLocationOnLocal} {delimiter} {rowKeyColumn.optional}");
        return;
    }

    String table = args[0];
    String columnFamily = args[1];
    String outputPath = args[2];
    String compressionCodec = args[3];
    String schemaFilePath = args[4];
    String delimiter = args[5];

    String rowKeyColumn = "";
    if (args.length > 6) {
        rowKeyColumn = args[6];
    }

    Job job = Job.getInstance();
    job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
    HBaseConfiguration.addHbaseResources(job.getConfiguration());
    job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);
    job.getConfiguration().set(OUTPUT_PATH_CONF, outputPath);
    job.getConfiguration().set(DELIMITER_CONF, delimiter);

    job.setJarByClass(ExportHBaseTableToDelimiteredSeq.class);
    job.setJobName("ExportHBaseTableToDelimiteredSeq ");

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    scan.addFamily(Bytes.toBytes(columnFamily));

    TableMapReduceUtil.initTableMapperJob(
        table,           // input HBase table name
        scan,            // Scan instance to control CF and attribute selection
        MyMapper.class,  // mapper
        null,            // mapper output key
        null,            // mapper output value
        job);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    if (compressionCodec.equals("snappy")) {
        SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
    } else if (compressionCodec.equals("gzip")) {
        SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
        // nothing
    }

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(0);

    boolean b = job.waitForCompletion(true);
}
Example 18
Source File: TotalOrderSortingStage.java From hadoop-map-reduce-patterns with Apache License 2.0
@SuppressWarnings("unchecked") @Override public int run(String[] args) throws Exception { Configuration conf = new Configuration(); Path inputPath = new Path(args[0]); Path partitionFile = new Path(args[1] + "_partitions.lst"); Path outputStage = new Path(args[1] + "_staging"); Path outputOrder = new Path(args[1]); // Configure job to prepare for sampling Job sampleJob = new Job(conf, "TotalOrderSortingStage"); sampleJob.setJarByClass(TotalOrderSortingStage.class); // Use the mapper implementation with zero reduce tasks sampleJob.setMapperClass(LastAccessMapper.class); sampleJob.setNumReduceTasks(0); sampleJob.setOutputKeyClass(Text.class); sampleJob.setOutputValueClass(Text.class); TextInputFormat.setInputPaths(sampleJob, inputPath); // Set the output format to a sequence file sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(sampleJob, outputStage); // Submit the job and get completion code. int code = sampleJob.waitForCompletion(true) ? 0 : 1; if (code == 0) { Job orderJob = new Job(conf, "TotalOrderSortingStage"); orderJob.setJarByClass(TotalOrderSortingStage.class); // Here, use the identity mapper to output the key/value pairs in // the SequenceFile orderJob.setMapperClass(Mapper.class); orderJob.setReducerClass(ValuesReducer.class); // Set the number of reduce tasks to an appropriate number for the // amount of data being sorted orderJob.setNumReduceTasks(10); // Use Hadoop's TotalOrderPartitioner class orderJob.setPartitionerClass(TotalOrderPartitioner.class); // Set the partition file TotalOrderPartitioner.setPartitionFile(orderJob.getConfiguration(), partitionFile); orderJob.setOutputKeyClass(Text.class); orderJob.setOutputValueClass(Text.class); // Set the input to the previous job's output orderJob.setInputFormatClass(SequenceFileInputFormat.class); SequenceFileInputFormat.setInputPaths(orderJob, outputStage); // Set the output path to the command line parameter TextOutputFormat.setOutputPath(orderJob, outputOrder); // Set the separator to an empty string orderJob.getConfiguration().set( "mapred.textoutputformat.separator", ""); // Use the InputSampler to go through the output of the previous // job, sample it, and create the partition file InputSampler.writePartitionFile(orderJob, new InputSampler.RandomSampler(.001, 10000)); // Submit the job code = orderJob.waitForCompletion(true) ? 0 : 2; } // Clean up the partition file and the staging directory FileSystem.get(new Configuration()).delete(partitionFile, false); FileSystem.get(new Configuration()).delete(outputStage, true); return code; }
Example 19
Source File: FlinkMergingDictionary.java From kylin with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String segmentIds = optionsHelper.getOptionValue(OPTION_MERGE_SEGMENT_IDS);
    final String dictOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_DICT);
    final String statOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_STAT);
    final String enableObjectReuseOptValue = optionsHelper.getOptionValue(OPTION_ENABLE_OBJECT_REUSE);

    boolean enableObjectReuse = false;
    if (enableObjectReuseOptValue != null && !enableObjectReuseOptValue.isEmpty()) {
        enableObjectReuse = true;
    }

    final Job job = Job.getInstance();

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (enableObjectReuse) {
        env.getConfig().enableObjectReuse();
    }

    HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

    final SerializableConfiguration sConf = new SerializableConfiguration(job.getConfiguration());
    final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = CubeDescManager.getInstance(envConfig).getCubeDesc(cubeInstance.getDescName());

    logger.info("Dictionary output path: {}", dictOutputPath);
    logger.info("Statistics output path: {}", statOutputPath);

    final TblColRef[] tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
    final int columnLength = tblColRefs.length;

    List<Integer> indexs = Lists.newArrayListWithCapacity(columnLength);
    for (int i = 0; i <= columnLength; i++) {
        indexs.add(i);
    }

    DataSource<Integer> indexDS = env.fromCollection(indexs);

    DataSet<Tuple2<Text, Text>> colToDictPathDS = indexDS.map(new MergeDictAndStatsFunction(cubeName,
        metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));

    FlinkUtil.setHadoopConfForCuboid(job, null, null);
    HadoopOutputFormat<Text, Text> hadoopOF = new HadoopOutputFormat<>(new SequenceFileOutputFormat<>(), job);
    SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

    colToDictPathDS.output(hadoopOF).setParallelism(1);

    env.execute("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
}
Example 20
Source File: MergeDictionaryJob.java From kylin-on-parquet-v2 with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    try {
        Options options = new Options();
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_META_URL);
        options.addOption(OPTION_MERGE_SEGMENT_IDS);
        options.addOption(OPTION_OUTPUT_PATH_DICT);
        options.addOption(OPTION_OUTPUT_PATH_STAT);
        parseOptions(options, args);

        final String segmentId = getOptionValue(OPTION_SEGMENT_ID);
        final String segmentIds = getOptionValue(OPTION_MERGE_SEGMENT_IDS);
        final String cubeName = getOptionValue(OPTION_CUBE_NAME);
        final String metaUrl = getOptionValue(OPTION_META_URL);
        final String dictOutputPath = getOptionValue(OPTION_OUTPUT_PATH_DICT);
        final String statOutputPath = getOptionValue(OPTION_OUTPUT_PATH_STAT);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeDesc cubeDesc = cube.getDescriptor();
        CubeSegment segment = cube.getSegmentById(segmentId);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        job.getConfiguration().set(BatchConstants.ARG_CUBE_NAME, cubeName);
        job.getConfiguration().set(OPTION_META_URL.getOpt(), metaUrl);
        job.getConfiguration().set(OPTION_SEGMENT_ID.getOpt(), segmentId);
        job.getConfiguration().set(OPTION_MERGE_SEGMENT_IDS.getOpt(), segmentIds);
        job.getConfiguration().set(OPTION_OUTPUT_PATH_STAT.getOpt(), statOutputPath);
        job.getConfiguration().set("num.map.tasks",
            String.valueOf(cubeDesc.getAllColumnsNeedDictionaryBuilt().size() + 1));
        job.setNumReduceTasks(1);

        setJobClasspath(job, cube.getConfig());

        // dump metadata to HDFS
        attachSegmentsMetadataWithDict(mergingSeg, metaUrl);

        // clean output dir
        HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

        job.setMapperClass(MergeDictionaryMapper.class);
        job.setReducerClass(MergeDictionaryReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(IndexArrInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.NONE);
        SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

        logger.info("Starting: " + job.getJobName());

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}