Java Code Examples for org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat#setOutputPath()

The following examples show how to use org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat#setOutputPath(). The original project and source file are noted above each example.
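Before the project examples, here is a minimal, self-contained sketch of the basic call pattern. It is not taken from any of the projects below: an identity, map-only job copies text input into a SequenceFile, and SequenceFileOutputFormat.setOutputPath() names the directory the part files are written to. The class name and the use of args[0]/args[1] for the paths are placeholders for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class TextToSequenceFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "text-to-sequencefile");
        job.setJarByClass(TextToSequenceFile.class);

        // Identity mapper, no reducers: each (offset, line) record produced by
        // TextInputFormat is written straight into the SequenceFile output.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        TextInputFormat.addInputPath(job, new Path(args[0]));
        // setOutputPath() takes the output *directory*; it must not already exist.
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

As in the examples that follow, the path passed to setOutputPath() is a directory that FileOutputFormat creates for the job; if it already exists, the job fails its output check before any tasks run.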
Example 1
Source File: Step32.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  
    Configuration conf1 = new Configuration();  

    Job job1 = new Job(conf1, "wiki  job one");  
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);  
    job1.setInputFormatClass(SequenceFileInputFormat.class);  
    job1.setNumReduceTasks(1);  
    job1.setJarByClass(Step32.class);  
    job1.setMapperClass(WikiMapper32.class);  
    job1.setMapOutputKeyClass(IntWritable.class);  
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);  
  
    job1.setReducerClass(WiKiReducer32.class);  
    job1.setOutputKeyClass(IntWritable.class);  
    job1.setOutputValueClass(VectorOrPrefWritable.class);  
      
    // the WiKiDriver's output is this step's input  
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));  
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));     
    if(!job1.waitForCompletion(true)){  
        System.exit(1); // exit with an error code if the job failed  
    }  
}
 
Example 2
Source File: Step1.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws Exception {
    Configuration conf1 = new Configuration();

    Job job1 = new Job(conf1, "step1");
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step1.class);
    job1.setMapperClass(WikiMapper1.class);
    job1.setMapOutputKeyClass(VarLongWritable.class);
    job1.setMapOutputValueClass(LongAndFloat.class);
    job1.setReducerClass(WiKiReducer1.class);
    job1.setOutputKeyClass(VarLongWritable.class);
    job1.setOutputValueClass(VectorWritable.class);

    FileInputFormat.addInputPath(job1, new Path( INPUT_PATH ) );
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH ));
    if (!job1.waitForCompletion(true)) {
        System.exit(1);
    }
}
 
Example 3
Source File: Step2.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  

        Configuration conf1 = new Configuration();  

        Job job1 = new Job(conf1, "wiki  job two");  
        job1.setNumReduceTasks(1);  
        job1.setJarByClass(Step2.class);  
        job1.setInputFormatClass(SequenceFileInputFormat.class);  
        job1.setMapperClass(WikiMapper2.class);  
        job1.setMapOutputKeyClass(IntWritable.class);  
        job1.setMapOutputValueClass(IntWritable.class);  
        job1.setReducerClass(WiKiReducer2.class);  
        job1.setOutputKeyClass(IntWritable.class);  
        job1.setOutputValueClass(VectorWritable.class);  
        job1.setOutputFormatClass(SequenceFileOutputFormat.class);  
        SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));  
        SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));     
        if(!job1.waitForCompletion(true)){  
            System.exit(1); // exit with an error code if the job failed  
        }  
    }
 
Example 4
Source File: Step31.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  
    Configuration conf1 = new Configuration();  

    Job job1 = new Job(conf1, "wiki  job three1");  
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);  
    job1.setInputFormatClass(SequenceFileInputFormat.class);  
    job1.setNumReduceTasks(1);  
    job1.setJarByClass(Step31.class);  
    job1.setMapperClass(WikiMapper31.class);  
    job1.setMapOutputKeyClass(IntWritable.class);  
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);  
      
    // a reducer is set here only so that SequenceFileOutputFormat can be used  
    job1.setReducerClass(WiKiReducer31.class);  
    job1.setOutputKeyClass(IntWritable.class);  
    job1.setOutputValueClass(VectorOrPrefWritable.class);  
      
    // this job's input is the output of the Step2 job  
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));  
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));     
    if(!job1.waitForCompletion(true)){  
        System.exit(1); // exit with an error code if the job failed  
    }  
}
 
Example 5
Source File: AbstractReasoningTool.java    From rya with Apache License 2.0
/**
 * Set up a MapReduce job to output newly derived triples.
 * @param   intermediate    True if this is intermediate data. Outputs
 *                          to [base]-[iteration]-[temp].
 */
protected void configureDerivationOutput(boolean intermediate) {
    Path outPath;
    Configuration conf = job.getConfiguration();
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    if (intermediate) {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration
            + MRReasoningUtils.TEMP_SUFFIX);
    }
    else {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration);
    }
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT,
        SequenceFileOutputFormat.class, Derivation.class, NullWritable.class);
    MultipleOutputs.setCountersEnabled(job, true);
    // Set up an output for diagnostic info, if needed
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
}
 
Example 6
Source File: DistBlockIntegrityMonitor.java    From RDFS with Apache License 2.0
/**
 * creates and submits a job, updates file index and job index
 */
private void startJob(String jobName, Set<String> lostFiles, Priority priority, long detectTime)
throws IOException, InterruptedException, ClassNotFoundException {
  Path inDir = new Path(JOB_NAME_PREFIX + "/in/" + jobName);
  Path outDir = new Path(JOB_NAME_PREFIX + "/out/" + jobName);
  List<String> filesInJob = createInputFile(
      jobName, inDir, lostFiles);
  if (filesInJob.isEmpty()) return;

  Configuration jobConf = new Configuration(getConf());
  RaidUtils.parseAndSetOptions(jobConf, priority.configOption);
  Job job = new Job(jobConf, jobName);
  job.getConfiguration().set(CORRUPT_FILE_DETECT_TIME, Long.toString(detectTime));
  configureJob(job, this.RECONSTRUCTOR_CLASS);
  job.setJarByClass(getClass());
  job.setMapperClass(ReconstructionMapper.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(ReconstructionInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  ReconstructionInputFormat.setInputPaths(job, inDir);
  SequenceFileOutputFormat.setOutputPath(job, outDir);
  

  submitJob(job, filesInJob, priority);
  List<LostFileInfo> fileInfos =
    updateFileIndex(jobName, filesInJob, priority);
  // The implementation of submitJob() need not update jobIndex.
  // So check if the job exists in jobIndex before updating jobInfos.
  if (jobIndex.containsKey(job)) {
    jobIndex.put(job, fileInfos);
  }
  numJobsRunning++;
}
 
Example 7
Source File: AbstractReasoningTool.java    From rya with Apache License 2.0
/**
 * Set up the MapReduce job to output a schema (TBox).
 */
protected void configureSchemaOutput() {
    Path outPath = MRReasoningUtils.getSchemaPath(job.getConfiguration());
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SchemaWritable.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, "schemaobj",
        SequenceFileOutputFormat.class, NullWritable.class, SchemaWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}
 
Example 8
Source File: JoinSelectStatisticsTest.java    From rya with Apache License 2.0
public static void initTabToSeqFileJob(Job job, String intable, String outpath) throws AccumuloSecurityException, IOException {
   
   Configuration conf = job.getConfiguration();
  
   String username = conf.get(USERNAME);
   System.out.println("Username is " + username);
   String password = conf.get(PASSWORD);
   String instance = conf.get(INSTANCE);
   System.out.println("Instance is " + instance);
   
  
   AccumuloInputFormat.setMockInstance(job, instance);
   AccumuloInputFormat.setConnectorInfo(job, username, new PasswordToken(password));
   AccumuloInputFormat.setInputTableName(job, intable);
   
   job.setInputFormatClass(AccumuloInputFormat.class);
   job.setMapOutputKeyClass(CompositeType.class);
   job.setMapOutputValueClass(TripleCard.class);

   System.out.println("Outpath is " + outpath);
   
   // OUTPUT
   if(outpath.equals("spo")) {
       SPOOUT = new File(File.createTempFile(outpath, "txt").getParentFile(), System.currentTimeMillis() + "spo");
       SequenceFileOutputFormat.setOutputPath(job, new Path(SPOOUT.getAbsolutePath()));
   } else {
       PROSPECTSOUT = new File(File.createTempFile(outpath, "txt").getParentFile(), System.currentTimeMillis() + "prospects");
       SequenceFileOutputFormat.setOutputPath(job, new Path(PROSPECTSOUT.getAbsolutePath()));
   }
   job.setOutputFormatClass(SequenceFileOutputFormat.class);
   job.setOutputKeyClass(CompositeType.class);
   job.setOutputValueClass(TripleCard.class);
    
}
 
Example 9
Source File: JoinSelectStatisticsTest.java    From rya with Apache License 2.0
@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    String outpath = conf.get(OUTPUTPATH);
    
    Job job = new Job(conf, this.getClass().getSimpleName() + "_" + System.currentTimeMillis());
    job.setJarByClass(this.getClass());
    conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true);
    
    MultipleInputs.addInputPath(job, new Path(PROSPECTSOUT.getAbsolutePath()), 
            SequenceFileInputFormat.class, JoinSelectAggregateMapper.class);
    MultipleInputs.addInputPath(job,new Path(SPOOUT.getAbsolutePath()) , 
            SequenceFileInputFormat.class, JoinSelectAggregateMapper.class);
    job.setMapOutputKeyClass(CompositeType.class);
    job.setMapOutputValueClass(TripleCard.class);

    tempDir = new File(File.createTempFile(outpath, "txt").getParentFile(), System.currentTimeMillis() + "");
    SequenceFileOutputFormat.setOutputPath(job, new Path(tempDir.getAbsolutePath()));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(TripleEntry.class);
    job.setOutputValueClass(CardList.class);


    job.setSortComparatorClass(JoinSelectSortComparator.class);
    job.setGroupingComparatorClass(JoinSelectGroupComparator.class);
    job.setPartitionerClass(JoinSelectPartitioner.class);
    job.setReducerClass(JoinReducer.class);
    job.setNumReduceTasks(32);
    job.waitForCompletion(true);
    
    return job.isSuccessful() ? 0 : 1;          
}
 
Example 10
Source File: TestMROutputLegacy.java    From tez with Apache License 2.0
@Test (timeout = 5000)
public void testNewAPI_MapperOnly() throws Exception {
  String outputPath = TEST_DIR.getAbsolutePath();
  Job job = Job.getInstance();
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
  job.getConfiguration().setBoolean("mapred.mapper.new-api", true);
  // the output is attached to mapper
  job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, true);
  UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
  OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
      .setUserPayload(vertexPayload);
  DataSinkDescriptor sink = DataSinkDescriptor.create(od,
      OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);

  OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
  MROutputLegacy output = new MROutputLegacy(outputContext, 2);
  output.initialize();
  assertEquals(true, output.useNewApi);
  assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
  assertNull(output.oldOutputFormat);
  assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
  assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
  assertNull(output.oldApiTaskAttemptContext);
  assertNotNull(output.newRecordWriter);
  assertNull(output.oldRecordWriter);
  assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
 
Example 11
Source File: JoinSelectStatsUtil.java    From rya with Apache License 2.0
public static void initTabToSeqFileJob(Job job, String intable, String outpath, String auths) throws AccumuloSecurityException {

    Configuration conf = job.getConfiguration();
    String username = conf.get(USERNAME);
    String password = conf.get(PASSWORD);
    String instance = conf.get(INSTANCE);
    String zookeepers = conf.get(ZOOKEEPERS);

    System.out.println("Zookeepers are " + auths);

    if (zookeepers != null) {
      AccumuloInputFormat.setZooKeeperInstance(job, instance, zookeepers);
    } else {
      throw new IllegalArgumentException("Must specify either mock or zookeepers");
    }

    AccumuloInputFormat.setConnectorInfo(job, username, new PasswordToken(password));
    AccumuloInputFormat.setScanAuthorizations(job, new Authorizations(auths));
    AccumuloInputFormat.setInputTableName(job, intable);
    job.setInputFormatClass(AccumuloInputFormat.class);
    job.setMapOutputKeyClass(CompositeType.class);
    job.setMapOutputValueClass(TripleCard.class);

    // OUTPUT
    SequenceFileOutputFormat.setOutputPath(job, new Path(outpath));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(CompositeType.class);
    job.setOutputValueClass(TripleCard.class);

  }
 
Example 12
Source File: TestMROutputLegacy.java    From tez with Apache License 2.0
@Test (timeout = 5000)
public void testNewAPI_MR() throws Exception {
  String outputPath = TEST_DIR.getAbsolutePath();
  Job job = Job.getInstance();
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
  job.getConfiguration().setBoolean("mapred.reducer.new-api", true);
  // the output is attached to reducer
  job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, false);
  UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
  OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
      .setUserPayload(vertexPayload);
  DataSinkDescriptor sink = DataSinkDescriptor.create(od,
      OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);

  OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
  MROutputLegacy output = new MROutputLegacy(outputContext, 2);
  output.initialize();
  assertEquals(true, output.useNewApi);
  assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
  assertNull(output.oldOutputFormat);
  assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
  assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
  assertNull(output.oldApiTaskAttemptContext);
  assertNotNull(output.newRecordWriter);
  assertNull(output.oldRecordWriter);
  assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
 
Example 13
Source File: PcapJob.java    From metron with Apache License 2.0
/**
 * Creates, but does not submit, the job. This is the core MapReduce job. An empty
 * input path results in null being returned instead of a job.
 */
public Job createJob(Optional<String> jobName
                    ,Path basePath
                    , Path jobOutputPath
                    , long beginNS
                    , long endNS
                    , int numReducers
                    , T fields
                    , Configuration conf
                    , FileSystem fs
                    , PcapFilterConfigurator<T> filterImpl
                    ) throws IOException
{
  Iterable<String> filteredPaths = FileFilterUtil.getPathsInTimeRange(beginNS, endNS, listFiles(fs, basePath));
  String inputPaths = Joiner.on(',').join(filteredPaths);
  if (StringUtils.isEmpty(inputPaths)) {
    return null;
  }
  conf.set(START_TS_CONF, Long.toUnsignedString(beginNS));
  conf.set(END_TS_CONF, Long.toUnsignedString(endNS));
  conf.set(WIDTH_CONF, "" + findWidth(beginNS, endNS, numReducers));
  filterImpl.addToConfig(fields, conf);
  Job job = Job.getInstance(conf);
  jobName.ifPresent(job::setJobName);
  job.setJarByClass(PcapJob.class);
  job.setMapperClass(PcapJob.PcapMapper.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(BytesWritable.class);
  job.setNumReduceTasks(numReducers);
  job.setReducerClass(PcapReducer.class);
  job.setPartitionerClass(PcapPartitioner.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  SequenceFileInputFormat.addInputPaths(job, inputPaths);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, jobOutputPath);
  return job;
}
 
Example 14
Source File: FlinkMergingDictionary.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String segmentIds = optionsHelper.getOptionValue(OPTION_MERGE_SEGMENT_IDS);
    final String dictOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_DICT);
    final String statOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_STAT);
    final String enableObjectReuseOptValue = optionsHelper.getOptionValue(OPTION_ENABLE_OBJECT_REUSE);

    boolean enableObjectReuse = false;
    if (enableObjectReuseOptValue != null && !enableObjectReuseOptValue.isEmpty()) {
        enableObjectReuse = true;
    }

    final Job job = Job.getInstance();

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (enableObjectReuse) {
        env.getConfig().enableObjectReuse();
    }

    HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

    final SerializableConfiguration sConf = new SerializableConfiguration(job.getConfiguration());
    final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = CubeDescManager.getInstance(envConfig).getCubeDesc(cubeInstance.getDescName());

    logger.info("Dictionary output path: {}", dictOutputPath);
    logger.info("Statistics output path: {}", statOutputPath);

    final TblColRef[] tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
    final int columnLength = tblColRefs.length;

    List<Integer> indexs = Lists.newArrayListWithCapacity(columnLength);

    for (int i = 0; i <= columnLength; i++) {
        indexs.add(i);
    }

    DataSource<Integer> indexDS = env.fromCollection(indexs);

    DataSet<Tuple2<Text, Text>> colToDictPathDS = indexDS.map(new MergeDictAndStatsFunction(cubeName,
            metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));

    FlinkUtil.setHadoopConfForCuboid(job, null, null);
    HadoopOutputFormat<Text, Text> hadoopOF =
            new HadoopOutputFormat<>(new SequenceFileOutputFormat<>(), job);
    SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

    colToDictPathDS.output(hadoopOF).setParallelism(1);

    env.execute("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
}
 
Example 15
Source File: DataProfNoCriteriaJobExecutor.java    From jumbune with GNU Lesser General Public License v3.0
/**
 * Main method for job execution.
 *
 * @param args command-line arguments
 */
public static void main(String[] args) throws IOException, InterruptedException , ClassNotFoundException{

	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	LOGGER.debug("Data Profiling job values respectively ["+otherArgs[0]+"], "+
			 otherArgs[1]);
	StringBuilder sb = new StringBuilder();
	
	int dynamicArgs = 0;		
	dynamicArgs = ((otherArgs.length)-1);
	
	for (int i = dynamicArgs; i < otherArgs.length; i++) {
		LOGGER.debug("other arguments" + otherArgs[i]);
		sb.append(otherArgs[i]);
	}
	
	String outputPath = DataProfilingConstants.OUTPUT_DIR_PATH + new Date().getTime();
	String inputPath = otherArgs[0].replace(" ", "");
	String dpBeanString = sb.toString();
	LOGGER.debug("Received dpBean value [" + dpBeanString+"]");
	Gson gson = new Gson();
	Type type = new TypeToken<DataProfilingBean>() {
	}.getType();
	
	DataProfilingBean dataProfilingBean = gson.fromJson(dpBeanString, type);
	String recordSeparator = dataProfilingBean.getRecordSeparator();
	conf.set(DataProfilingConstants.DATA_PROFILING_BEAN, dpBeanString);
	conf.set(DataProfilingConstants.RECORD_SEPARATOR, recordSeparator);
	
	conf.set(DataProfilingConstants.TEXTINPUTFORMAT_RECORD_DELIMITER, recordSeparator);

	Job job = new Job(conf, DataProfilingConstants.JOB_NAME);

	job.setJarByClass(DataProfNoCriteriaJobExecutor.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	
	job.setMapperClass(DataProfNoCriteriaMapper.class);
	job.setCombinerClass(DataProfNoCriteriaReducer.class);
	job.setReducerClass(DataProfNoCriteriaReducer.class);
	
	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);
	
   	Path[] inputPaths = FileUtil.getAllNestedFilePath(job, inputPath);		
	
	TextInputFormat.setInputPaths(job, inputPaths);
	SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
	
	job.waitForCompletion(true);
	
	Map<String, Integer> sortedMap = readJobOutputFromHdfs(conf,outputPath);
	final Gson dpReportGson = new GsonBuilder().disableHtmlEscaping().create();

	final String jsonString = dpReportGson.toJson(sortedMap);
	LOGGER.info(DataProfilingConstants.DATA_PROFILING_REPORT + jsonString);
}
 
Example 16
Source File: DataProfilingJobExecutor.java    From jumbune with GNU Lesser General Public License v3.0
/**
 * @param args
 */
public static void main(String[] args) throws IOException, InterruptedException , ClassNotFoundException{
	
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	LOGGER.debug("Data Profiling job values respectively ["+otherArgs[0]+"], "+
			 otherArgs[1]);
	StringBuilder sb = new StringBuilder();
	
	int dynamicArgs = 0;		
	dynamicArgs = ((otherArgs.length)-1);
	
	for (int i = dynamicArgs; i < otherArgs.length; i++) {
		LOGGER.debug("other arguments" + otherArgs[i]);
		sb.append(otherArgs[i]);
	}
	
	String outputPath = DataProfilingConstants.OUTPUT_DIR_PATH + new Date().getTime();
	String inputPath = otherArgs[0];
	String dpBeanString = sb.toString();
	LOGGER.debug("Received dpBean value [" + dpBeanString+"]");
	Gson gson = new Gson();
	Type type = new TypeToken<DataProfilingBean>() {
	}.getType();
	
	DataProfilingBean dataProfilingBean = gson.fromJson(dpBeanString, type);
	String recordSeparator = dataProfilingBean.getRecordSeparator();
	conf.set(DataProfilingConstants.DATA_PROFILING_BEAN, dpBeanString);
	conf.set(DataProfilingConstants.RECORD_SEPARATOR, recordSeparator);
	
	conf.set(DataProfilingConstants.TEXTINPUTFORMAT_RECORD_DELIMITER, recordSeparator);
	
	Job job = new Job(conf,DataProfilingConstants.JOB_NAME);
	
	job.setJarByClass(DataProfilingJobExecutor.class);
	job.setMapperClass(DataProfilingMapper.class);
	
	job.setCombinerClass(DataProfilingReducer.class);
	job.setReducerClass(DataProfilingReducer.class);
	
	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);
	
	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(IntWritable.class);
	
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	
   	Path[] inputPaths = FileUtil.getAllNestedFilePath(job, inputPath);		
	
	TextInputFormat.setInputPaths(job, inputPaths);
	SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
	
	job.waitForCompletion(true);
	LOGGER.debug("Job completed , now going to read the result from hdfs");
	Set<CriteriaBasedDataProfiling> criteriaBasedDataProfilings = readJobOutputFromHdfs(conf,outputPath,dataProfilingBean);
	final Gson dpReportGson = new GsonBuilder().disableHtmlEscaping().create();

	final String jsonString = dpReportGson.toJson(criteriaBasedDataProfilings);
	LOGGER.info(DataProfilingConstants.DATA_PROFILING_REPORT + jsonString);
}
 
Example 17
Source File: ExportHBaseTableToDelimiteredSeq.java    From HBase-ToHDFS with Apache License 2.0
public static void main (String[] args) throws IOException, InterruptedException, ClassNotFoundException {
  if (args.length < 6) {
    System.out
        .println("ExportHBaseTableToDelimiteredSeq {tableName} {ColumnFamily} {outputPath} {compressionCodec} {schemaLocationOnLocal} {delimiter} {rowKeyColumn.optional}");
    return;
  }

  String table = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String compressionCodec = args[3];
  String schemaFilePath = args[4];
  String delimiter = args[5];

  String rowKeyColumn = "";
  if (args.length > 6) {
    rowKeyColumn = args[6];
  }
  
  Job job = Job.getInstance();
  job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
  
  HBaseConfiguration.addHbaseResources(job.getConfiguration());
  
  job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);
  job.getConfiguration().set(OUTPUT_PATH_CONF, outputPath);
  job.getConfiguration().set(DELIMITER_CONF, delimiter);

  job.setJarByClass(ExportHBaseTableToDelimiteredSeq.class);
  job.setJobName("ExportHBaseTableToDelimiteredSeq ");

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for
                        // MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  scan.addFamily(Bytes.toBytes(columnFamily));

  TableMapReduceUtil.initTableMapperJob(table, // input HBase table name
      scan, // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper
      null, // mapper output key
      null, // mapper output value
      job);
  job.setOutputFormatClass(SequenceFileOutputFormat.class); 
  SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
  
  if (compressionCodec.equals("snappy")) {
    SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  } else if (compressionCodec.equals("gzip")) {
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } else {
    // no compression codec requested; leave the output uncompressed
  }
  
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);
  
  job.setNumReduceTasks(0);
  
  boolean b = job.waitForCompletion(true);
}
 
Example 18
Source File: TotalOrderSortingStage.java    From hadoop-map-reduce-patterns with Apache License 2.0
@SuppressWarnings("unchecked")
@Override
public int run(String[] args) throws Exception {
	Configuration conf = new Configuration();
	Path inputPath = new Path(args[0]);
	Path partitionFile = new Path(args[1] + "_partitions.lst");
	Path outputStage = new Path(args[1] + "_staging");
	Path outputOrder = new Path(args[1]);
	// Configure job to prepare for sampling
	Job sampleJob = new Job(conf, "TotalOrderSortingStage");
	sampleJob.setJarByClass(TotalOrderSortingStage.class);
	// Use the mapper implementation with zero reduce tasks
	sampleJob.setMapperClass(LastAccessMapper.class);
	sampleJob.setNumReduceTasks(0);
	sampleJob.setOutputKeyClass(Text.class);
	sampleJob.setOutputValueClass(Text.class);
	TextInputFormat.setInputPaths(sampleJob, inputPath);
	// Set the output format to a sequence file
	sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class);
	SequenceFileOutputFormat.setOutputPath(sampleJob, outputStage);
	// Submit the job and get completion code.
	int code = sampleJob.waitForCompletion(true) ? 0 : 1;
	if (code == 0) {
		Job orderJob = new Job(conf, "TotalOrderSortingStage");
		orderJob.setJarByClass(TotalOrderSortingStage.class);
		// Here, use the identity mapper to output the key/value pairs in
		// the SequenceFile
		orderJob.setMapperClass(Mapper.class);
		orderJob.setReducerClass(ValuesReducer.class);
		// Set the number of reduce tasks to an appropriate number for the
		// amount of data being sorted
		orderJob.setNumReduceTasks(10);
		// Use Hadoop's TotalOrderPartitioner class
		orderJob.setPartitionerClass(TotalOrderPartitioner.class);
		// Set the partition file
		TotalOrderPartitioner.setPartitionFile(orderJob.getConfiguration(),
				partitionFile);
		orderJob.setOutputKeyClass(Text.class);
		orderJob.setOutputValueClass(Text.class);
		// Set the input to the previous job's output
		orderJob.setInputFormatClass(SequenceFileInputFormat.class);
		SequenceFileInputFormat.setInputPaths(orderJob, outputStage);
		// Set the output path to the command line parameter
		TextOutputFormat.setOutputPath(orderJob, outputOrder);
		// Set the separator to an empty string
		orderJob.getConfiguration().set(
				"mapred.textoutputformat.separator", "");
		// Use the InputSampler to go through the output of the previous
		// job, sample it, and create the partition file
		InputSampler.writePartitionFile(orderJob,
				new InputSampler.RandomSampler(.001, 10000));
		// Submit the job
		code = orderJob.waitForCompletion(true) ? 0 : 2;
	}
	// Clean up the partition file and the staging directory
	FileSystem.get(new Configuration()).delete(partitionFile, false);
	FileSystem.get(new Configuration()).delete(outputStage, true);
	return code;
}
 
Example 19
Source File: FlinkMergingDictionary.java    From kylin with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String segmentIds = optionsHelper.getOptionValue(OPTION_MERGE_SEGMENT_IDS);
    final String dictOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_DICT);
    final String statOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_STAT);
    final String enableObjectReuseOptValue = optionsHelper.getOptionValue(OPTION_ENABLE_OBJECT_REUSE);

    boolean enableObjectReuse = false;
    if (enableObjectReuseOptValue != null && !enableObjectReuseOptValue.isEmpty()) {
        enableObjectReuse = true;
    }

    final Job job = Job.getInstance();

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (enableObjectReuse) {
        env.getConfig().enableObjectReuse();
    }

    HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

    final SerializableConfiguration sConf = new SerializableConfiguration(job.getConfiguration());
    final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = CubeDescManager.getInstance(envConfig).getCubeDesc(cubeInstance.getDescName());

    logger.info("Dictionary output path: {}", dictOutputPath);
    logger.info("Statistics output path: {}", statOutputPath);

    final TblColRef[] tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
    final int columnLength = tblColRefs.length;

    List<Integer> indexs = Lists.newArrayListWithCapacity(columnLength);

    for (int i = 0; i <= columnLength; i++) {
        indexs.add(i);
    }

    DataSource<Integer> indexDS = env.fromCollection(indexs);

    DataSet<Tuple2<Text, Text>> colToDictPathDS = indexDS.map(new MergeDictAndStatsFunction(cubeName,
            metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));

    FlinkUtil.setHadoopConfForCuboid(job, null, null);
    HadoopOutputFormat<Text, Text> hadoopOF =
            new HadoopOutputFormat<>(new SequenceFileOutputFormat<>(), job);
    SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

    colToDictPathDS.output(hadoopOF).setParallelism(1);

    env.execute("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
}
 
Example 20
Source File: MergeDictionaryJob.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    try {
        Options options = new Options();
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_META_URL);
        options.addOption(OPTION_MERGE_SEGMENT_IDS);
        options.addOption(OPTION_OUTPUT_PATH_DICT);
        options.addOption(OPTION_OUTPUT_PATH_STAT);
        parseOptions(options, args);

        final String segmentId = getOptionValue(OPTION_SEGMENT_ID);
        final String segmentIds = getOptionValue(OPTION_MERGE_SEGMENT_IDS);
        final String cubeName = getOptionValue(OPTION_CUBE_NAME);
        final String metaUrl = getOptionValue(OPTION_META_URL);
        final String dictOutputPath = getOptionValue(OPTION_OUTPUT_PATH_DICT);
        final String statOutputPath = getOptionValue(OPTION_OUTPUT_PATH_STAT);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeDesc cubeDesc = cube.getDescriptor();
        CubeSegment segment = cube.getSegmentById(segmentId);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        job.getConfiguration().set(BatchConstants.ARG_CUBE_NAME, cubeName);
        job.getConfiguration().set(OPTION_META_URL.getOpt(), metaUrl);
        job.getConfiguration().set(OPTION_SEGMENT_ID.getOpt(), segmentId);
        job.getConfiguration().set(OPTION_MERGE_SEGMENT_IDS.getOpt(), segmentIds);
        job.getConfiguration().set(OPTION_OUTPUT_PATH_STAT.getOpt(), statOutputPath);
        job.getConfiguration().set("num.map.tasks", String.valueOf(cubeDesc.getAllColumnsNeedDictionaryBuilt().size() + 1));
        job.setNumReduceTasks(1);

        setJobClasspath(job, cube.getConfig());

        // dump metadata to HDFS
        attachSegmentsMetadataWithDict(mergingSeg, metaUrl);

        // clean output dir
        HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

        job.setMapperClass(MergeDictionaryMapper.class);
        job.setReducerClass(MergeDictionaryReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(IndexArrInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.NONE);
        SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

        logger.info("Starting: " + job.getJobName());

        return waitForCompletion(job);

    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}