Java Code Examples for org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat#setOutputPath()
The following examples show how to use
org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat#setOutputPath().
The examples are drawn from open-source projects; the source file, project, and license are listed above each example.
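As a starting point, here is a minimal, self-contained driver sketch showing the call pattern the examples below share: setOutputPath() (inherited from FileOutputFormat) names the directory the job writes to, and setOutputFormatClass(SequenceFileOutputFormat.class) makes the job emit its key/value pairs as a SequenceFile. The class name, the identity Mapper/Reducer, and the argument-based paths are illustrative assumptions, not code from any of the projects below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SequenceFileOutputDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "sequence-file-output-example");
        job.setJarByClass(SequenceFileOutputDriver.class);

        // Identity mapper and reducer: records pass through unchanged.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);

        // TextInputFormat produces LongWritable offsets and Text lines,
        // so the identity map/reduce chain keeps those types on output.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Read plain text, write the result as a binary SequenceFile.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        TextInputFormat.addInputPath(job, new Path(args[0]));
        // setOutputPath is inherited from FileOutputFormat; the output
        // directory must not exist yet or the job fails its output check.
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because the output directory must not already exist, several of the examples below delete the target path first (for instance via HadoopUtil.deletePath) before configuring the job.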
Example 1
Source File: Step32.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    // TODO Auto-generated method stub
    Configuration conf1 = new Configuration();
    Job job1 = new Job(conf1, "wiki job one");
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step32.class);
    job1.setMapperClass(WikiMapper32.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);
    job1.setReducerClass(WiKiReducer32.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorOrPrefWritable.class);
    // the WiKiDriver's output is this one's input
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}
Example 2
Source File: Step1.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws Exception {
    Configuration conf1 = new Configuration();
    Job job1 = new Job(conf1, "step1");
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step1.class);
    job1.setMapperClass(WikiMapper1.class);
    job1.setMapOutputKeyClass(VarLongWritable.class);
    job1.setMapOutputValueClass(LongAndFloat.class);
    job1.setReducerClass(WiKiReducer1.class);
    job1.setOutputKeyClass(VarLongWritable.class);
    job1.setOutputValueClass(VectorWritable.class);
    FileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1);
    }
}
Example 3
Source File: Step2.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf1 = new Configuration(); Job job1 = new Job(conf1, "wiki job two"); job1.setNumReduceTasks(1); job1.setJarByClass(Step2.class); job1.setInputFormatClass(SequenceFileInputFormat.class); job1.setMapperClass(WikiMapper2.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(IntWritable.class); job1.setReducerClass(WiKiReducer2.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(VectorWritable.class); job1.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH)); SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH)); if(!job1.waitForCompletion(true)){ System.exit(1); // run error then exit } }
Example 4
Source File: Step31.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf1 = new Configuration(); Job job1 = new Job(conf1, "wiki job three1"); job1.setOutputFormatClass(SequenceFileOutputFormat.class); job1.setInputFormatClass(SequenceFileInputFormat.class); job1.setNumReduceTasks(1); job1.setJarByClass(Step31.class); job1.setMapperClass(WikiMapper31.class); job1.setMapOutputKeyClass(IntWritable.class); job1.setMapOutputValueClass(VectorOrPrefWritable.class); // set a reducer only to use SequenceFileOutputFormat job1.setReducerClass(WiKiReducer31.class); job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(VectorOrPrefWritable.class); // this MR's input is the MR2's output SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH)); SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH)); if(!job1.waitForCompletion(true)){ System.exit(1); // run error then exit } }
Example 5
Source File: AbstractReasoningTool.java From rya with Apache License 2.0
/**
 * Set up a MapReduce job to output newly derived triples.
 * @param intermediate True if this is intermediate data. Outputs
 *     to [base]-[iteration]-[temp].
 */
protected void configureDerivationOutput(boolean intermediate) {
    Path outPath;
    Configuration conf = job.getConfiguration();
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    if (intermediate) {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration + MRReasoningUtils.TEMP_SUFFIX);
    } else {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration);
    }
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT,
        SequenceFileOutputFormat.class, Derivation.class, NullWritable.class);
    MultipleOutputs.setCountersEnabled(job, true);
    // Set up an output for diagnostic info, if needed
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
}
Example 6
Source File: DistBlockIntegrityMonitor.java From RDFS with Apache License 2.0
/**
 * creates and submits a job, updates file index and job index
 */
private void startJob(String jobName, Set<String> lostFiles, Priority priority, long detectTime)
        throws IOException, InterruptedException, ClassNotFoundException {
    Path inDir = new Path(JOB_NAME_PREFIX + "/in/" + jobName);
    Path outDir = new Path(JOB_NAME_PREFIX + "/out/" + jobName);
    List<String> filesInJob = createInputFile(jobName, inDir, lostFiles);
    if (filesInJob.isEmpty()) return;

    Configuration jobConf = new Configuration(getConf());
    RaidUtils.parseAndSetOptions(jobConf, priority.configOption);
    Job job = new Job(jobConf, jobName);
    job.getConfiguration().set(CORRUPT_FILE_DETECT_TIME, Long.toString(detectTime));
    configureJob(job, this.RECONSTRUCTOR_CLASS);
    job.setJarByClass(getClass());
    job.setMapperClass(ReconstructionMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(ReconstructionInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    ReconstructionInputFormat.setInputPaths(job, inDir);
    SequenceFileOutputFormat.setOutputPath(job, outDir);

    submitJob(job, filesInJob, priority);
    List<LostFileInfo> fileInfos = updateFileIndex(jobName, filesInJob, priority);
    // The implementation of submitJob() need not update jobIndex.
    // So check if the job exists in jobIndex before updating jobInfos.
    if (jobIndex.containsKey(job)) {
        jobIndex.put(job, fileInfos);
    }
    numJobsRunning++;
}
Example 7
Source File: AbstractReasoningTool.java From rya with Apache License 2.0
/**
 * Set up the MapReduce job to output a schema (TBox).
 */
protected void configureSchemaOutput() {
    Path outPath = MRReasoningUtils.getSchemaPath(job.getConfiguration());
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SchemaWritable.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, "schemaobj",
        SequenceFileOutputFormat.class, NullWritable.class, SchemaWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}
Example 8
Source File: JoinSelectStatisticsTest.java From rya with Apache License 2.0
public static void initTabToSeqFileJob(Job job, String intable, String outpath)
        throws AccumuloSecurityException, IOException {
    Configuration conf = job.getConfiguration();
    String username = conf.get(USERNAME);
    System.out.println("Username is " + username);
    String password = conf.get(PASSWORD);
    String instance = conf.get(INSTANCE);
    System.out.println("Instance is " + instance);

    AccumuloInputFormat.setMockInstance(job, instance);
    AccumuloInputFormat.setConnectorInfo(job, username, new PasswordToken(password));
    AccumuloInputFormat.setInputTableName(job, intable);

    job.setInputFormatClass(AccumuloInputFormat.class);
    job.setMapOutputKeyClass(CompositeType.class);
    job.setMapOutputValueClass(TripleCard.class);

    System.out.println("Outpath is " + outpath);

    // OUTPUT
    if (outpath.equals("spo")) {
        SPOOUT = new File(File.createTempFile(outpath, "txt").getParentFile(),
            System.currentTimeMillis() + "spo");
        SequenceFileOutputFormat.setOutputPath(job, new Path(SPOOUT.getAbsolutePath()));
    } else {
        PROSPECTSOUT = new File(File.createTempFile(outpath, "txt").getParentFile(),
            System.currentTimeMillis() + "prospects");
        SequenceFileOutputFormat.setOutputPath(job, new Path(PROSPECTSOUT.getAbsolutePath()));
    }
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(CompositeType.class);
    job.setOutputValueClass(TripleCard.class);
}
Example 9
Source File: JoinSelectStatisticsTest.java From rya with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String outpath = conf.get(OUTPUTPATH);

    Job job = new Job(conf, this.getClass().getSimpleName() + "_" + System.currentTimeMillis());
    job.setJarByClass(this.getClass());
    conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true);

    MultipleInputs.addInputPath(job, new Path(PROSPECTSOUT.getAbsolutePath()),
        SequenceFileInputFormat.class, JoinSelectAggregateMapper.class);
    MultipleInputs.addInputPath(job, new Path(SPOOUT.getAbsolutePath()),
        SequenceFileInputFormat.class, JoinSelectAggregateMapper.class);
    job.setMapOutputKeyClass(CompositeType.class);
    job.setMapOutputValueClass(TripleCard.class);

    tempDir = new File(File.createTempFile(outpath, "txt").getParentFile(),
        System.currentTimeMillis() + "");
    SequenceFileOutputFormat.setOutputPath(job, new Path(tempDir.getAbsolutePath()));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(TripleEntry.class);
    job.setOutputValueClass(CardList.class);

    job.setSortComparatorClass(JoinSelectSortComparator.class);
    job.setGroupingComparatorClass(JoinSelectGroupComparator.class);
    job.setPartitionerClass(JoinSelectPartitioner.class);
    job.setReducerClass(JoinReducer.class);
    job.setNumReduceTasks(32);
    job.waitForCompletion(true);

    return job.isSuccessful() ? 0 : 1;
}
Example 10
Source File: TestMROutputLegacy.java From tez with Apache License 2.0
@Test (timeout = 5000)
public void testNewAPI_MapperOnly() throws Exception {
    String outputPath = TEST_DIR.getAbsolutePath();
    Job job = Job.getInstance();
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.getConfiguration().setBoolean("mapred.mapper.new-api", true);
    // the output is attached to mapper
    job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, true);
    UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
    OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
        .setUserPayload(vertexPayload);
    DataSinkDescriptor sink = DataSinkDescriptor.create(od,
        OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);

    OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
    MROutputLegacy output = new MROutputLegacy(outputContext, 2);
    output.initialize();

    assertEquals(true, output.useNewApi);
    assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
    assertNull(output.oldOutputFormat);
    assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
    assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
    assertNull(output.oldApiTaskAttemptContext);
    assertNotNull(output.newRecordWriter);
    assertNull(output.oldRecordWriter);
    assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
Example 11
Source File: JoinSelectStatsUtil.java From rya with Apache License 2.0
public static void initTabToSeqFileJob(Job job, String intable, String outpath, String auths)
        throws AccumuloSecurityException {
    Configuration conf = job.getConfiguration();
    String username = conf.get(USERNAME);
    String password = conf.get(PASSWORD);
    String instance = conf.get(INSTANCE);
    String zookeepers = conf.get(ZOOKEEPERS);

    System.out.println("Zookeepers are " + auths);

    if (zookeepers != null) {
        AccumuloInputFormat.setZooKeeperInstance(job, instance, zookeepers);
    } else {
        throw new IllegalArgumentException("Must specify either mock or zookeepers");
    }

    AccumuloInputFormat.setConnectorInfo(job, username, new PasswordToken(password));
    AccumuloInputFormat.setScanAuthorizations(job, new Authorizations(auths));
    AccumuloInputFormat.setInputTableName(job, intable);
    job.setInputFormatClass(AccumuloInputFormat.class);
    job.setMapOutputKeyClass(CompositeType.class);
    job.setMapOutputValueClass(TripleCard.class);

    // OUTPUT
    SequenceFileOutputFormat.setOutputPath(job, new Path(outpath));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(CompositeType.class);
    job.setOutputValueClass(TripleCard.class);
}
Example 12
Source File: TestMROutputLegacy.java From tez with Apache License 2.0
@Test (timeout = 5000)
public void testNewAPI_MR() throws Exception {
    String outputPath = TEST_DIR.getAbsolutePath();
    Job job = Job.getInstance();
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.getConfiguration().setBoolean("mapred.reducer.new-api", true);
    // the output is attached to reducer
    job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, false);
    UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
    OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
        .setUserPayload(vertexPayload);
    DataSinkDescriptor sink = DataSinkDescriptor.create(od,
        OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);

    OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
    MROutputLegacy output = new MROutputLegacy(outputContext, 2);
    output.initialize();

    assertEquals(true, output.useNewApi);
    assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
    assertNull(output.oldOutputFormat);
    assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
    assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
    assertNull(output.oldApiTaskAttemptContext);
    assertNotNull(output.newRecordWriter);
    assertNull(output.oldRecordWriter);
    assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
Example 13
Source File: PcapJob.java From metron with Apache License 2.0
/**
 * Creates, but does not submit the job. This is the core MapReduce mrJob. Empty input path
 * results in a null to be returned instead of creating the job.
 */
public Job createJob(Optional<String> jobName
                    , Path basePath
                    , Path jobOutputPath
                    , long beginNS
                    , long endNS
                    , int numReducers
                    , T fields
                    , Configuration conf
                    , FileSystem fs
                    , PcapFilterConfigurator<T> filterImpl
                    ) throws IOException {
    Iterable<String> filteredPaths = FileFilterUtil.getPathsInTimeRange(beginNS, endNS, listFiles(fs, basePath));
    String inputPaths = Joiner.on(',').join(filteredPaths);
    if (StringUtils.isEmpty(inputPaths)) {
        return null;
    }
    conf.set(START_TS_CONF, Long.toUnsignedString(beginNS));
    conf.set(END_TS_CONF, Long.toUnsignedString(endNS));
    conf.set(WIDTH_CONF, "" + findWidth(beginNS, endNS, numReducers));
    filterImpl.addToConfig(fields, conf);
    Job job = Job.getInstance(conf);
    jobName.ifPresent(job::setJobName);
    job.setJarByClass(PcapJob.class);
    job.setMapperClass(PcapJob.PcapMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(numReducers);
    job.setReducerClass(PcapReducer.class);
    job.setPartitionerClass(PcapPartitioner.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    SequenceFileInputFormat.addInputPaths(job, inputPaths);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, jobOutputPath);
    return job;
}
Example 14
Source File: FlinkMergingDictionary.java From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String segmentIds = optionsHelper.getOptionValue(OPTION_MERGE_SEGMENT_IDS);
    final String dictOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_DICT);
    final String statOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_STAT);
    final String enableObjectReuseOptValue = optionsHelper.getOptionValue(OPTION_ENABLE_OBJECT_REUSE);

    boolean enableObjectReuse = false;
    if (enableObjectReuseOptValue != null && !enableObjectReuseOptValue.isEmpty()) {
        enableObjectReuse = true;
    }

    final Job job = Job.getInstance();

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (enableObjectReuse) {
        env.getConfig().enableObjectReuse();
    }

    HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

    final SerializableConfiguration sConf = new SerializableConfiguration(job.getConfiguration());
    final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = CubeDescManager.getInstance(envConfig).getCubeDesc(cubeInstance.getDescName());

    logger.info("Dictionary output path: {}", dictOutputPath);
    logger.info("Statistics output path: {}", statOutputPath);

    final TblColRef[] tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
    final int columnLength = tblColRefs.length;

    List<Integer> indexs = Lists.newArrayListWithCapacity(columnLength);
    for (int i = 0; i <= columnLength; i++) {
        indexs.add(i);
    }

    DataSource<Integer> indexDS = env.fromCollection(indexs);

    DataSet<Tuple2<Text, Text>> colToDictPathDS = indexDS.map(new MergeDictAndStatsFunction(cubeName,
        metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));

    FlinkUtil.setHadoopConfForCuboid(job, null, null);
    HadoopOutputFormat<Text, Text> hadoopOF = new HadoopOutputFormat<>(new SequenceFileOutputFormat<>(), job);
    SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

    colToDictPathDS.output(hadoopOF).setParallelism(1);

    env.execute("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
}
Example 15
Source File: DataProfNoCriteriaJobExecutor.java From jumbune with GNU Lesser General Public License v3.0
/***
 * main method for job execution
 * @param args
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    LOGGER.debug("Data Profiling job values respectively [" + otherArgs[0] + "], " + otherArgs[1]);
    StringBuilder sb = new StringBuilder();
    int dynamicArgs = 0;
    dynamicArgs = ((otherArgs.length) - 1);

    for (int i = dynamicArgs; i < otherArgs.length; i++) {
        LOGGER.debug("other arguments" + otherArgs[i]);
        sb.append(otherArgs[i]);
    }

    String outputPath = DataProfilingConstants.OUTPUT_DIR_PATH + new Date().getTime();
    String inputPath = otherArgs[0].replace(" ", "");
    String dpBeanString = sb.toString();
    LOGGER.debug("Received dpBean value [" + dpBeanString + "]");
    Gson gson = new Gson();
    Type type = new TypeToken<DataProfilingBean>() {
    }.getType();

    DataProfilingBean dataProfilingBean = gson.fromJson(dpBeanString, type);
    String recordSeparator = dataProfilingBean.getRecordSeparator();
    conf.set(DataProfilingConstants.DATA_PROFILING_BEAN, dpBeanString);
    conf.set(DataProfilingConstants.RECORD_SEPARATOR, recordSeparator);
    conf.set(DataProfilingConstants.TEXTINPUTFORMAT_RECORD_DELIMITER, recordSeparator);

    Job job = new Job(conf, DataProfilingConstants.JOB_NAME);
    job.setJarByClass(DataProfNoCriteriaJobExecutor.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(DataProfNoCriteriaMapper.class);
    job.setCombinerClass(DataProfNoCriteriaReducer.class);
    job.setReducerClass(DataProfNoCriteriaReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    Path[] inputPaths = FileUtil.getAllNestedFilePath(job, inputPath);
    TextInputFormat.setInputPaths(job, inputPaths);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);

    Map<String, Integer> sortedMap = readJobOutputFromHdfs(conf, outputPath);
    final Gson dpReportGson = new GsonBuilder().disableHtmlEscaping().create();
    final String jsonString = dpReportGson.toJson(sortedMap);
    LOGGER.info(DataProfilingConstants.DATA_PROFILING_REPORT + jsonString);
}
Example 16
Source File: DataProfilingJobExecutor.java From jumbune with GNU Lesser General Public License v3.0
/**
 * @param args
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    LOGGER.debug("Data Profiling job values respectively [" + otherArgs[0] + "], " + otherArgs[1]);
    StringBuilder sb = new StringBuilder();
    int dynamicArgs = 0;
    dynamicArgs = ((otherArgs.length) - 1);

    for (int i = dynamicArgs; i < otherArgs.length; i++) {
        LOGGER.debug("other arguments" + otherArgs[i]);
        sb.append(otherArgs[i]);
    }

    String outputPath = DataProfilingConstants.OUTPUT_DIR_PATH + new Date().getTime();
    String inputPath = otherArgs[0];
    String dpBeanString = sb.toString();
    LOGGER.debug("Received dpBean value [" + dpBeanString + "]");
    Gson gson = new Gson();
    Type type = new TypeToken<DataProfilingBean>() {
    }.getType();

    DataProfilingBean dataProfilingBean = gson.fromJson(dpBeanString, type);
    String recordSeparator = dataProfilingBean.getRecordSeparator();
    conf.set(DataProfilingConstants.DATA_PROFILING_BEAN, dpBeanString);
    conf.set(DataProfilingConstants.RECORD_SEPARATOR, recordSeparator);
    conf.set(DataProfilingConstants.TEXTINPUTFORMAT_RECORD_DELIMITER, recordSeparator);

    Job job = new Job(conf, DataProfilingConstants.JOB_NAME);
    job.setJarByClass(DataProfilingJobExecutor.class);
    job.setMapperClass(DataProfilingMapper.class);
    job.setCombinerClass(DataProfilingReducer.class);
    job.setReducerClass(DataProfilingReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    Path[] inputPaths = FileUtil.getAllNestedFilePath(job, inputPath);
    TextInputFormat.setInputPaths(job, inputPaths);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);
    LOGGER.debug("Job completed , now going to read the result from hdfs");

    Set<CriteriaBasedDataProfiling> criteriaBasedDataProfilings = readJobOutputFromHdfs(conf, outputPath, dataProfilingBean);
    final Gson dpReportGson = new GsonBuilder().disableHtmlEscaping().create();
    final String jsonString = dpReportGson.toJson(criteriaBasedDataProfilings);
    LOGGER.info(DataProfilingConstants.DATA_PROFILING_REPORT + jsonString);
}
Example 17
Source File: ExportHBaseTableToDelimiteredSeq.java From HBase-ToHDFS with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    if (args.length == 0) {
        System.out.println("ExportHBaseTableToDelimiteredSeq {tableName} {ColumnFamily} {outputPath} "
            + "{compressionCodec} {schemaLocationOnLocal} {delimiter} {rowKeyColumn.optional}");
        return;
    }

    String table = args[0];
    String columnFamily = args[1];
    String outputPath = args[2];
    String compressionCodec = args[3];
    String schemaFilePath = args[4];
    String delimiter = args[5];

    String rowKeyColumn = "";
    if (args.length > 6) {
        rowKeyColumn = args[6];
    }

    Job job = Job.getInstance();
    job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
    HBaseConfiguration.addHbaseResources(job.getConfiguration());
    job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);
    job.getConfiguration().set(OUTPUT_PATH_CONF, outputPath);
    job.getConfiguration().set(DELIMITER_CONF, delimiter);

    job.setJarByClass(ExportHBaseTableToDelimiteredSeq.class);
    job.setJobName("ExportHBaseTableToDelimiteredSeq ");

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    scan.addFamily(Bytes.toBytes(columnFamily));

    TableMapReduceUtil.initTableMapperJob(
        table,           // input HBase table name
        scan,            // Scan instance to control CF and attribute selection
        MyMapper.class,  // mapper
        null,            // mapper output key
        null,            // mapper output value
        job);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    if (compressionCodec.equals("snappy")) {
        SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
    } else if (compressionCodec.equals("gzip")) {
        SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
        // nothing
    }

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(0);

    boolean b = job.waitForCompletion(true);
}
Example 18
Source File: TotalOrderSortingStage.java From hadoop-map-reduce-patterns with Apache License 2.0
@SuppressWarnings("unchecked") @Override public int run(String[] args) throws Exception { Configuration conf = new Configuration(); Path inputPath = new Path(args[0]); Path partitionFile = new Path(args[1] + "_partitions.lst"); Path outputStage = new Path(args[1] + "_staging"); Path outputOrder = new Path(args[1]); // Configure job to prepare for sampling Job sampleJob = new Job(conf, "TotalOrderSortingStage"); sampleJob.setJarByClass(TotalOrderSortingStage.class); // Use the mapper implementation with zero reduce tasks sampleJob.setMapperClass(LastAccessMapper.class); sampleJob.setNumReduceTasks(0); sampleJob.setOutputKeyClass(Text.class); sampleJob.setOutputValueClass(Text.class); TextInputFormat.setInputPaths(sampleJob, inputPath); // Set the output format to a sequence file sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(sampleJob, outputStage); // Submit the job and get completion code. int code = sampleJob.waitForCompletion(true) ? 0 : 1; if (code == 0) { Job orderJob = new Job(conf, "TotalOrderSortingStage"); orderJob.setJarByClass(TotalOrderSortingStage.class); // Here, use the identity mapper to output the key/value pairs in // the SequenceFile orderJob.setMapperClass(Mapper.class); orderJob.setReducerClass(ValuesReducer.class); // Set the number of reduce tasks to an appropriate number for the // amount of data being sorted orderJob.setNumReduceTasks(10); // Use Hadoop's TotalOrderPartitioner class orderJob.setPartitionerClass(TotalOrderPartitioner.class); // Set the partition file TotalOrderPartitioner.setPartitionFile(orderJob.getConfiguration(), partitionFile); orderJob.setOutputKeyClass(Text.class); orderJob.setOutputValueClass(Text.class); // Set the input to the previous job's output orderJob.setInputFormatClass(SequenceFileInputFormat.class); SequenceFileInputFormat.setInputPaths(orderJob, outputStage); // Set the output path to the command line parameter TextOutputFormat.setOutputPath(orderJob, outputOrder); // Set the separator to an empty string orderJob.getConfiguration().set( "mapred.textoutputformat.separator", ""); // Use the InputSampler to go through the output of the previous // job, sample it, and create the partition file InputSampler.writePartitionFile(orderJob, new InputSampler.RandomSampler(.001, 10000)); // Submit the job code = orderJob.waitForCompletion(true) ? 0 : 2; } // Clean up the partition file and the staging directory FileSystem.get(new Configuration()).delete(partitionFile, false); FileSystem.get(new Configuration()).delete(outputStage, true); return code; }
Example 19
Source File: FlinkMergingDictionary.java From kylin with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String segmentIds = optionsHelper.getOptionValue(OPTION_MERGE_SEGMENT_IDS);
    final String dictOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_DICT);
    final String statOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_STAT);
    final String enableObjectReuseOptValue = optionsHelper.getOptionValue(OPTION_ENABLE_OBJECT_REUSE);

    boolean enableObjectReuse = false;
    if (enableObjectReuseOptValue != null && !enableObjectReuseOptValue.isEmpty()) {
        enableObjectReuse = true;
    }

    final Job job = Job.getInstance();

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (enableObjectReuse) {
        env.getConfig().enableObjectReuse();
    }

    HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

    final SerializableConfiguration sConf = new SerializableConfiguration(job.getConfiguration());
    final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = CubeDescManager.getInstance(envConfig).getCubeDesc(cubeInstance.getDescName());

    logger.info("Dictionary output path: {}", dictOutputPath);
    logger.info("Statistics output path: {}", statOutputPath);

    final TblColRef[] tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
    final int columnLength = tblColRefs.length;

    List<Integer> indexs = Lists.newArrayListWithCapacity(columnLength);
    for (int i = 0; i <= columnLength; i++) {
        indexs.add(i);
    }

    DataSource<Integer> indexDS = env.fromCollection(indexs);

    DataSet<Tuple2<Text, Text>> colToDictPathDS = indexDS.map(new MergeDictAndStatsFunction(cubeName,
        metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));

    FlinkUtil.setHadoopConfForCuboid(job, null, null);
    HadoopOutputFormat<Text, Text> hadoopOF = new HadoopOutputFormat<>(new SequenceFileOutputFormat<>(), job);
    SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

    colToDictPathDS.output(hadoopOF).setParallelism(1);

    env.execute("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
}
Example 20
Source File: MergeDictionaryJob.java From kylin-on-parquet-v2 with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    try {
        Options options = new Options();
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_META_URL);
        options.addOption(OPTION_MERGE_SEGMENT_IDS);
        options.addOption(OPTION_OUTPUT_PATH_DICT);
        options.addOption(OPTION_OUTPUT_PATH_STAT);
        parseOptions(options, args);

        final String segmentId = getOptionValue(OPTION_SEGMENT_ID);
        final String segmentIds = getOptionValue(OPTION_MERGE_SEGMENT_IDS);
        final String cubeName = getOptionValue(OPTION_CUBE_NAME);
        final String metaUrl = getOptionValue(OPTION_META_URL);
        final String dictOutputPath = getOptionValue(OPTION_OUTPUT_PATH_DICT);
        final String statOutputPath = getOptionValue(OPTION_OUTPUT_PATH_STAT);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeDesc cubeDesc = cube.getDescriptor();
        CubeSegment segment = cube.getSegmentById(segmentId);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        job.getConfiguration().set(BatchConstants.ARG_CUBE_NAME, cubeName);
        job.getConfiguration().set(OPTION_META_URL.getOpt(), metaUrl);
        job.getConfiguration().set(OPTION_SEGMENT_ID.getOpt(), segmentId);
        job.getConfiguration().set(OPTION_MERGE_SEGMENT_IDS.getOpt(), segmentIds);
        job.getConfiguration().set(OPTION_OUTPUT_PATH_STAT.getOpt(), statOutputPath);
        job.getConfiguration().set("num.map.tasks",
            String.valueOf(cubeDesc.getAllColumnsNeedDictionaryBuilt().size() + 1));
        job.setNumReduceTasks(1);

        setJobClasspath(job, cube.getConfig());

        // dump metadata to HDFS
        attachSegmentsMetadataWithDict(mergingSeg, metaUrl);

        // clean output dir
        HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

        job.setMapperClass(MergeDictionaryMapper.class);
        job.setReducerClass(MergeDictionaryReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(IndexArrInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.NONE);
        SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

        logger.info("Starting: " + job.getJobName());

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}