Java Code Examples for org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat

The following examples show how to use org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat. These examples are extracted from open source projects; the source project, source file, and license are noted above each example where available.
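
Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: configure a Job, set SequenceFileOutputFormat as the output format class, and optionally enable compression. This sketch is illustrative only and is not taken from any of the projects below; the identity Mapper and the block-compression settings are assumptions chosen for brevity.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SequenceFileWriteSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "sequence-file-write-sketch");
        job.setJarByClass(SequenceFileWriteSketch.class);

        // Map-only identity job: each input line's (offset, text) pair is written
        // straight to the SequenceFile as a (LongWritable, Text) record.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // Optional: block-compress the output with the default codec.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path supplied by the caller
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output dir must not already exist

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
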
Example 1
Source Project: recsys-offline   Source File: Step2.java    License: Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    Configuration conf1 = new Configuration();

    Job job1 = new Job(conf1, "wiki job two");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step2.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper2.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(IntWritable.class);
    job1.setReducerClass(WiKiReducer2.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorWritable.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // exit with an error status if the job failed
    }
}
 
Example 2
public static void setupReducer(Job job, CubeSegment cubeSegment, Path output) throws IOException {
    // Output
    // use LazyOutputFormat to avoid creating zero-sized default output files
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, output);

    // Reducer
    job.setReducerClass(ConvergeCuboidDataReducer.class);
    job.setPartitionerClass(ConvergeCuboidDataPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Pair<Integer, Integer> numReduceTasks = MapReduceUtil.getConvergeCuboidDataReduceTaskNums(cubeSegment);
    job.setNumReduceTasks(numReduceTasks.getFirst());

    int nBaseReduceTasks = numReduceTasks.getSecond();
    boolean enableSharding = cubeSegment.isEnableSharding();
    long baseCuboidId = cubeSegment.getCuboidScheduler().getBaseCuboidId();
    String partiParams = enableSharding + "," + baseCuboidId + "," + nBaseReduceTasks;
    job.getConfiguration().set(BatchConstants.CFG_CONVERGE_CUBOID_PARTITION_PARAM, partiParams);
}
 
Example 3
@Override
public void configureJobOutput(Job job, String output, CubeSegment segment, CuboidScheduler cuboidScheduler,
        int level) throws Exception {
    int reducerNum = 1;
    Class<?> mapperClass = job.getMapperClass();

    // allow users to override the config specifically for the base cuboid step
    if (mapperClass == HiveToBaseCuboidMapper.class) {
        for (Map.Entry<String, String> entry : segment.getConfig().getBaseCuboidMRConfigOverride().entrySet()) {
            job.getConfiguration().set(entry.getKey(), entry.getValue());
        }
    }

    if (mapperClass == HiveToBaseCuboidMapper.class || mapperClass == NDCuboidMapper.class) {
        reducerNum = MapReduceUtil.getLayeredCubingReduceTaskNum(segment, cuboidScheduler,
                AbstractHadoopJob.getTotalMapInputMB(job), level);
    } else if (mapperClass == InMemCuboidMapper.class) {
        reducerNum = MapReduceUtil.getInmemCubingReduceTaskNum(segment, cuboidScheduler);
    }
    Path outputPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(reducerNum);
    HadoopUtil.deletePath(job.getConfiguration(), outputPath);
}
 
Example 4
@Override
protected Class<? extends OutputFormat> getOutputFormatClass()
    throws ClassNotFoundException {
  if (isHCatJob) {
    LOG.debug("Returning HCatOutputFormat for output format");
    return SqoopHCatUtilities.getOutputFormatClass();
  }
  if (options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
    return RawKeyTextOutputFormat.class;
  } else if (options.getFileLayout()
      == SqoopOptions.FileLayout.SequenceFile) {
    return SequenceFileOutputFormat.class;
  } else if (options.getFileLayout()
      == SqoopOptions.FileLayout.AvroDataFile) {
    return AvroOutputFormat.class;
  } else if (options.getFileLayout()
      == SqoopOptions.FileLayout.ParquetFile) {
    return DatasetKeyOutputFormat.class;
  }

  return null;
}
 
Example 5
Source Project: hadoop   Source File: RandomTextWriterJob.java    License: Apache License 2.0
public Job createJob(Configuration conf) throws IOException {
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 10 * 1024);
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, numBytesToWritePerMap);
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = Job.getInstance(conf);

  job.setJarByClass(RandomTextWriterJob.class);
  job.setJobName("random-text-writer");

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomTextMapper.class);

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  //FileOutputFormat.setOutputPath(job, new Path("random-output"));
  job.setNumReduceTasks(0);
  return job;
}
 
Example 6
Source Project: big-c   Source File: TestJoinDatamerge.java    License: Apache License 2.0
private static void joinAs(String jointype, 
    Class<? extends SimpleCheckerMapBase<?>> map, 
    Class<? extends SimpleCheckerReduceBase> reduce) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  Path base = cluster.getFileSystem().makeQualified(new Path("/"+jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  conf.set(CompositeInputFormat.JOIN_EXPR, CompositeInputFormat.compose(jointype,
      SequenceFileInputFormat.class, src));
  conf.setInt("testdatamerge.sources", srcs);
  Job job = Job.getInstance(conf);
  job.setInputFormatClass(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));

  job.setMapperClass(map);
  job.setReducerClass(reduce);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  job.waitForCompletion(true);
  assertTrue("Job failed", job.isSuccessful());
  if ("outer".equals(jointype)) {
    checkOuterConsistency(job, src);
  }
  base.getFileSystem(conf).delete(base, true);
}
 
Example 7
Source Project: recsys-offline   Source File: Step4.java    License: Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
    Job job1 = new Job(conf1, "wiki job four");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step4.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper4.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);
    job1.setReducerClass(WiKiReducer4.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorAndPrefsWritable.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT1_PATH));
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT2_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // exit with an error status if the job failed
    }
}
 
Example 8
Source Project: beam   Source File: HadoopFormatIOSequenceFileTest.java    License: Apache License 2.0
@Test
public void batchTest() {

  String outputDir = getOutputDirPath("batchTest");

  Configuration conf =
      createWriteConf(
          SequenceFileOutputFormat.class,
          Text.class,
          LongWritable.class,
          outputDir,
          REDUCERS_COUNT,
          "0");

  executeBatchTest(
      HadoopFormatIO.<Text, LongWritable>write()
          .withConfiguration(conf)
          .withPartitioning()
          .withExternalSynchronization(new HDFSSynchronization(getLocksDirPath())),
      outputDir);

  Assert.assertEquals(
      "Lock folder should not contain any files", 0, new File(getLocksDirPath()).list().length);
}
 
Example 9
Source Project: beam   Source File: HadoopFormatIOSequenceFileTest.java    License: Apache License 2.0
@Override
public PCollectionView<Configuration> expand(PCollection<? extends KV<KeyT, ValueT>> input) {

  Configuration conf =
      createWriteConf(
          SequenceFileOutputFormat.class,
          keyClass,
          valueClass,
          outputDirPath,
          REDUCERS_COUNT,
          String.valueOf(windowNum++));
  return input
      .getPipeline()
      .apply(Create.<Configuration>of(conf))
      .apply(View.<Configuration>asSingleton().withDefaultValue(conf));
}
 
Example 10
Source Project: mrgeo   Source File: HdfsMrsPyramidOutputFormat.java    License: Apache License 2.0
@Override
public RecordWriter<WritableComparable<?>, Writable> getRecordWriter(TaskAttemptContext context) throws IOException
{
  CompressionCodec codec = null;
  CompressionType compressionType = CompressionType.NONE;
  if (getCompressOutput(context))
  {
    // find the kind of compression to do
    compressionType = SequenceFileOutputFormat.getOutputCompressionType(context);

    // find the right codec
    codec = getCompressionCodec(context);
  }

  Path file = getDefaultWorkFile(context, "");

  MapFile.Writer out = createMapFileWriter(context, codec, compressionType, file);

  return new Writer(out);
}
 
Example 11
Source Project: oryx   Source File: SaveToHDFSFunction.java    License: Apache License 2.0
@Override
public void call(JavaPairRDD<K,M> rdd, Time time) throws IOException {
  if (rdd.isEmpty()) {
    log.info("RDD was empty, not saving to HDFS");
  } else {
    String file = prefix + '-' + time.milliseconds() + '.' + suffix;
    Path path = new Path(file);
    FileSystem fs = FileSystem.get(path.toUri(), hadoopConf);
    if (fs.exists(path)) {
      log.warn("Saved data already existed, possibly from a failed job. Deleting {}", path);
      fs.delete(path, true);
    }
    log.info("Saving RDD to HDFS at {}", file);
    rdd.mapToPair(
        new ValueToWritableFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass)
    ).saveAsNewAPIHadoopFile(
        file,
        keyWritableClass,
        messageWritableClass,
        SequenceFileOutputFormat.class,
        hadoopConf);
  }
}
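
As a complement to the example above: pairs saved with saveAsNewAPIHadoopFile and SequenceFileOutputFormat can be read back with the matching new-API SequenceFileInputFormat. The following is a minimal sketch, not oryx code; the Text/LongWritable types and the command-line path are assumptions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ReadBackSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc =
        new JavaSparkContext(new SparkConf().setAppName("read-back-sketch"));
    // Read (Text, LongWritable) pairs back with the matching new-API input format.
    JavaPairRDD<Text, LongWritable> restored = jsc.newAPIHadoopFile(
        args[0],                        // path previously written by saveAsNewAPIHadoopFile
        SequenceFileInputFormat.class,  // read-side counterpart of SequenceFileOutputFormat
        Text.class,
        LongWritable.class,
        new Configuration());
    System.out.println("records restored: " + restored.count());
    jsc.stop();
  }
}
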
 
Example 12
Source Project: flink-perf   Source File: KMeansDriver.java    License: Apache License 2.0
public static void initializeCenters (Configuration conf, FileSystem fs, String pointsPath, String seqFilePath) throws Exception {
	Path points = new Path (pointsPath);
	Path seqFile = new Path (seqFilePath);
	if (fs.exists(seqFile)) {
		fs.delete(seqFile, true);
	}
	Job job = Job.getInstance(conf);
	job.setMapperClass(CenterInitializer.class);
	job.setReducerClass(Reducer.class);
	job.setNumReduceTasks(0);
	job.setMapOutputKeyClass(Centroid.class);
	job.setMapOutputValueClass(Point.class);
	job.setOutputKeyClass(Centroid.class);
	job.setOutputValueClass(Point.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);
	job.setInputFormatClass(TextInputFormat.class);
	FileInputFormat.addInputPath(job, new Path(pointsPath));
	FileOutputFormat.setOutputPath(job, seqFile);
	job.waitForCompletion(true);
}
 
Example 13
Source Project: kylin   Source File: ConvergeCuboidDataUtil.java    License: Apache License 2.0
public static void setupReducer(Job job, CubeSegment cubeSegment, Path output) throws IOException {
    // Output
    // use LazyOutputFormat to avoid creating zero-sized default output files
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, output);

    // Reducer
    job.setReducerClass(ConvergeCuboidDataReducer.class);
    job.setPartitionerClass(ConvergeCuboidDataPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Pair<Integer, Integer> numReduceTasks = MapReduceUtil.getConvergeCuboidDataReduceTaskNums(cubeSegment);
    job.setNumReduceTasks(numReduceTasks.getFirst());

    int nBaseReduceTasks = numReduceTasks.getSecond();
    boolean enableSharding = cubeSegment.isEnableSharding();
    long baseCuboidId = cubeSegment.getCuboidScheduler().getBaseCuboidId();
    String partiParams = enableSharding + "," + baseCuboidId + "," + nBaseReduceTasks;
    job.getConfiguration().set(BatchConstants.CFG_CONVERGE_CUBOID_PARTITION_PARAM, partiParams);
}
 
Example 14
Source Project: kylin   Source File: KafkaFlatTableJob.java    License: Apache License 2.0
private void setupMapper(CubeSegment cubeSeg) throws IOException {
    // set the segment's offset info to job conf
    Map<Integer, Long> offsetStart = cubeSeg.getSourcePartitionOffsetStart();
    Map<Integer, Long> offsetEnd = cubeSeg.getSourcePartitionOffsetEnd();

    Integer minPartition = Collections.min(offsetStart.keySet());
    Integer maxPartition = Collections.max(offsetStart.keySet());
    job.getConfiguration().set(CONFIG_KAFKA_PARITION_MIN, minPartition.toString());
    job.getConfiguration().set(CONFIG_KAFKA_PARITION_MAX, maxPartition.toString());

    for(Integer partition: offsetStart.keySet()) {
        job.getConfiguration().set(CONFIG_KAFKA_PARITION_START + partition, offsetStart.get(partition).toString());
        job.getConfiguration().set(CONFIG_KAFKA_PARITION_END + partition, offsetEnd.get(partition).toString());
    }

    job.setMapperClass(KafkaFlatTableMapper.class);
    job.setInputFormatClass(KafkaInputFormat.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
}
 
Example 15
Source Project: hbase   Source File: Export.java    License: Apache License 2.0
/**
 * Sets up the actual job.
 *
 * @param conf  The current configuration.
 * @param args  The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args)
throws IOException {
  Triple<TableName, Scan, Path> arguments = ExportUtils.getArgumentsFromCommandLine(conf, args);
  String tableName = arguments.getFirst().getNameAsString();
  Path outputDir = arguments.getThird();
  Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
  job.setJobName(NAME + "_" + tableName);
  job.setJarByClass(Export.class);
  // Set optional scan parameters
  Scan s = arguments.getSecond();
  IdentityTableMapper.initJob(tableName, s, IdentityTableMapper.class, job);
  // No reducers.  Just write straight to output files.
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(Result.class);
  FileOutputFormat.setOutputPath(job, outputDir); // the job conf doesn't yet contain the cluster conf, so it has no default fs
  return job;
}
 
Example 16
Source Project: hiped2   Source File: SequenceFileStoreFunc.java    License: Apache License 2.0
@Override
public void setStoreLocation(String location, Job job)
    throws IOException {
  job.setOutputKeyClass(keyClass);
  job.setOutputValueClass(valueClass);
  if (compressionType != null && compressionCodecClass != null) {
    Class<? extends CompressionCodec> codecClass =
        FileOutputFormat.getOutputCompressorClass(job,
            DefaultCodec.class);
    SequenceFileOutputFormat.
        setOutputCompressorClass(job, codecClass);
    SequenceFileOutputFormat.setOutputCompressionType(job,
        SequenceFile.CompressionType.valueOf(compressionType));
  }
  FileOutputFormat.setOutputPath(job, new Path(location));
}
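
A note on the compression settings above: SequenceFile compression is transparent on read, because the codec and compression type are recorded in the file header. The following is a minimal read-back sketch, not part of the hiped2 project; the command-line path is an assumption.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SequenceFileReadSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]); // e.g. a part-r-00000 file under the job's output dir
    try (SequenceFile.Reader reader =
             new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
      // The key/value classes and any codec are recorded in the file header.
      Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
      while (reader.next(key, value)) {
        System.out.println(key + "\t" + value);
      }
    }
  }
}
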
 
Example 17
Source Project: tez   Source File: TestMROutputLegacy.java    License: Apache License 2.0
@Test (timeout = 5000)
public void testNewAPI_MapperOnly() throws Exception {
  String outputPath = TEST_DIR.getAbsolutePath();
  Job job = Job.getInstance();
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
  job.getConfiguration().setBoolean("mapred.mapper.new-api", true);
  // the output is attached to mapper
  job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, true);
  UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
  OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
      .setUserPayload(vertexPayload);
  DataSinkDescriptor sink = DataSinkDescriptor.create(od,
      OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);

  OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
  MROutputLegacy output = new MROutputLegacy(outputContext, 2);
  output.initialize();
  assertEquals(true, output.useNewApi);
  assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
  assertNull(output.oldOutputFormat);
  assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
  assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
  assertNull(output.oldApiTaskAttemptContext);
  assertNotNull(output.newRecordWriter);
  assertNull(output.oldRecordWriter);
  assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
 
Example 18
Source Project: kylin-on-parquet-v2   Source File: UHCDictionaryJob.java    License: Apache License 2.0
private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(UHCDictionaryReducer.class);
    job.setPartitionerClass(UHCDictionaryPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class, NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // use LazyOutputFormat to avoid creating zero-sized default output files
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
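
For context on the MultipleOutputs.addNamedOutput call above: records reach a named output through a MultipleOutputs instance held by the reducer. The following is a generic sketch of that pattern, not Kylin's UHCDictionaryReducer; the named output "dict" and the Text value type are assumptions.

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class NamedOutputReducer extends Reducer<Text, Text, NullWritable, Text> {

  private MultipleOutputs<NullWritable, Text> mos;

  @Override
  protected void setup(Context context) {
    mos = new MultipleOutputs<>(context);
  }

  @Override
  protected void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    for (Text value : values) {
      // Written to files named dict-r-NNNNN instead of the default part-r-NNNNN.
      mos.write("dict", NullWritable.get(), value);
    }
  }

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    mos.close(); // must be closed, or the lazily created outputs lose data
  }
}
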
 
Example 19
Source Project: tez   Source File: TestMROutputLegacy.java    License: Apache License 2.0
@Test (timeout = 5000)
public void testOldAPI_MapperOnly() throws Exception {
  String outputPath = TEST_DIR.getAbsolutePath();
  JobConf conf = new JobConf();
  conf.setOutputKeyClass(NullWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setOutputFormat(org.apache.hadoop.mapred.SequenceFileOutputFormat.class);
  org.apache.hadoop.mapred.SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));
  // the output is attached to mapper
  conf.setBoolean(MRConfig.IS_MAP_PROCESSOR, true);
  UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(conf);
  OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
      .setUserPayload(vertexPayload);
  DataSinkDescriptor sink = DataSinkDescriptor.create(od,
      OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);

  OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
  MROutputLegacy output = new MROutputLegacy(outputContext, 2);
  output.initialize();
  assertEquals(false, output.useNewApi);
  assertEquals(org.apache.hadoop.mapred.SequenceFileOutputFormat.class, output.oldOutputFormat.getClass());
  assertNull(output.newOutputFormat);
  assertEquals(NullWritable.class, output.oldApiTaskAttemptContext.getOutputKeyClass());
  assertEquals(Text.class, output.oldApiTaskAttemptContext.getOutputValueClass());
  assertNull(output.newApiTaskAttemptContext);
  assertNotNull(output.oldRecordWriter);
  assertNull(output.newRecordWriter);
  assertEquals(org.apache.hadoop.mapred.FileOutputCommitter.class, output.committer.getClass());
}
 
Example 20
private void setupReducer(Path output, CubeSegment cubeSeg) throws IOException {
    int hllShardBase = MapReduceUtil.getCuboidHLLCounterReducerNum(cubeSeg.getCubeInstance());

    job.setReducerClass(CalculateStatsFromBaseCuboidReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(hllShardBase);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    deletePath(job.getConfiguration(), output);
}
 
Example 21
@Override
public void configureJobOutput(Job job, String output, CubeSegment segment) throws Exception {
    int reducerNum = MapReduceUtil.getLayeredCubingReduceTaskNum(segment, segment.getCuboidScheduler(),
            AbstractHadoopJob.getTotalMapInputMB(job), -1);
    job.setNumReduceTasks(reducerNum);

    Path outputPath = new Path(output);
    HadoopUtil.deletePath(job.getConfiguration(), outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
}
 
Example 22
Source Project: laser   Source File: OnlineFeatureDriver.java    License: Apache License 2.0
public static long run(String collection, Path input, Path output,
		Configuration baseConf) throws IOException, ClassNotFoundException,
		InterruptedException {
	Configuration conf = new Configuration(baseConf);
	Job job = Job.getInstance(conf);

	job.setJarByClass(OnlineFeatureDriver.class);
	job.setJobName("GROUP each record's feature BY identifier");

	FileInputFormat.setInputPaths(job, input);
	FileOutputFormat.setOutputPath(job, output);

	job.setInputFormatClass(SequenceFileInputFormat.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);

	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(OnlineVectorWritable.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(ListWritable.class);

	job.setMapperClass(OnlineFeatureMapper.class);
	job.setReducerClass(OnlineFeatureReducer.class);

	HadoopUtil.delete(conf, output);
	boolean succeeded = job.waitForCompletion(true);
	if (!succeeded) {
		throw new IllegalStateException("Job: Group feature failed!");
	}
	Counter counter = job.getCounters().findCounter(
			"org.apache.hadoop.mapred.Task$Counter",
			"REDUCE_OUTPUT_RECORDS");
	long reduceOutputRecords = counter.getValue();

	LOG.info(
			"Job: GROUP each record's feature BY identifier, output recordes = {}",
			reduceOutputRecords);

	return reduceOutputRecords;
}
 
Example 23
Source Project: deeplearning4j   Source File: SparkStorageUtils.java    License: Apache License 2.0
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd,
                 Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId(); //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class,
                    SequenceFileOutputFormat.class);
}
 
Example 24
Source Project: pravega-samples   Source File: TeraStreamValidate.java    License: Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length != 5) {
    usage();
    return 2;
  }
  LOG.info("starting");
  Path inputDir = new Path(args[0]);
  Path outputDir = new Path(args[1]);
  getConf().setStrings(INPUT_URI_STRING, args[2]);
  getConf().setStrings(INPUT_SCOPE_NAME, args[3]);
  getConf().setStrings(INPUT_STREAM_NAME, args[4]);
  getConf().setStrings(INPUT_DESERIALIZER, TextSerializer.class.getName());

  getConf().setInt(MRJobConfig.NUM_MAPS, 1);
  Job job = Job.getInstance(getConf());

  TeraInputFormat.setInputPaths(job, inputDir);
  FileOutputFormat.setOutputPath(job, outputDir);

  job.setJobName("TeraStreamValidate");
  job.setJarByClass(TeraStreamValidate.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(TeraSortMapper.class);
  job.setNumReduceTasks(1);

  job.setInputFormatClass(PravegaInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  int ret = job.waitForCompletion(true) ? 0 : 1;
  LOG.info("done");
  return ret;
}
 
Example 25
Source Project: Kylin   Source File: IIDistinctColumnsJob.java    License: Apache License 2.0
private void setupReducer(Path output) throws IOException {
    job.setReducerClass(IIDistinctColumnsReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.OUTPUT_PATH, output.toString());

    job.setNumReduceTasks(1);

    deletePath(job.getConfiguration(), output);
}
 
Example 26
Source Project: hadoop-sstable   Source File: SimpleExample.java    License: Apache License 2.0
@Override
public int run(String[] args) throws Exception {

    long startTime = System.currentTimeMillis();
    Options options = buildOptions();

    CommandLineParser cliParser = new BasicParser();
    CommandLine cli = cliParser.parse(options, args);
    if (cli.getArgs().length < 2) {
        printUsage(options);
    }
    Job job = getJobConf(cli);

    job.setJobName("Simple Example");

    job.setJarByClass(SimpleExample.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(SimpleExampleMapper.class);
    job.setReducerClass(SimpleExampleReducer.class);

    job.setInputFormatClass(SSTableRowInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    String inputPaths = cli.getArgs()[0];
    LOG.info("Setting initial input paths to {}", inputPaths);
    SSTableInputFormat.addInputPaths(job, inputPaths);

    final String outputPath = cli.getArgs()[1];
    LOG.info("Setting initial output paths to {}", outputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    boolean success = job.waitForCompletion(true);
    LOG.info("Total runtime: {}s", (System.currentTimeMillis() - startTime) / 1000);
    return success ? 0 : 1;
}
 
Example 27
Source Project: tez   Source File: TestMultiMROutput.java    License: Apache License 2.0
@Test
public void testMergeConf() throws Exception {
  JobConf payloadConf = new JobConf();
  payloadConf.set("local-key", "local-value");
  DataSinkDescriptor dataSink = MultiMROutput.createConfigBuilder(
      payloadConf, SequenceFileOutputFormat.class, "/output", false).build();

  Configuration baseConf = new Configuration(false);
  baseConf.set("base-key", "base-value");

  OutputContext outputContext = mock(OutputContext.class);
  ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
  when(outputContext.getUserPayload()).thenReturn(dataSink.getOutputDescriptor().getUserPayload());
  when(outputContext.getApplicationId()).thenReturn(appId);
  when(outputContext.getTaskVertexIndex()).thenReturn(1);
  when(outputContext.getTaskAttemptNumber()).thenReturn(1);
  when(outputContext.getCounters()).thenReturn(new TezCounters());
  when(outputContext.getStatisticsReporter()).thenReturn(mock(OutputStatisticsReporter.class));
  when(outputContext.getContainerConfiguration()).thenReturn(baseConf);

  MultiMROutput output = new MultiMROutput(outputContext, 2);
  output.initialize();

  Configuration mergedConf = output.jobConf;
  assertEquals("base-value", mergedConf.get("base-key"));
  assertEquals("local-value", mergedConf.get("local-key"));
}
 
Example 28
Source Project: DataVec   Source File: SparkStorageUtils.java    License: Apache License 2.0
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd,
                 Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId(); //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class,
                    SequenceFileOutputFormat.class);
}
 
Example 29
Source Project: tez   Source File: TestMROutputLegacy.java    License: Apache License 2.0
@Test (timeout = 5000)
public void testNewAPI_MR() throws Exception {
  String outputPath = TEST_DIR.getAbsolutePath();
  Job job = Job.getInstance();
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
  job.getConfiguration().setBoolean("mapred.reducer.new-api", true);
  // the output is attached to reducer
  job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, false);
  UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
  OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
      .setUserPayload(vertexPayload);
  DataSinkDescriptor sink = DataSinkDescriptor.create(od,
      OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);

  OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
  MROutputLegacy output = new MROutputLegacy(outputContext, 2);
  output.initialize();
  assertEquals(true, output.useNewApi);
  assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
  assertNull(output.oldOutputFormat);
  assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
  assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
  assertNull(output.oldApiTaskAttemptContext);
  assertNotNull(output.newRecordWriter);
  assertNull(output.oldRecordWriter);
  assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
 
Example 30
Source Project: incubator-retired-mrql   Source File: MapOperation.java    License: Apache License 2.0
/** The cMap physical operator
 * @param map_fnc       mapper function
 * @param acc_fnc       optional accumulator function
 * @param zero          optional zero value for the accumulator
 * @param source        input data source
 * @param stop_counter  optional counter used in repeat operation
 * @return a new data source that contains the result
 */
public final static DataSet cMap ( Tree map_fnc,         // mapper function
                                   Tree acc_fnc,         // optional accumulator function
                                   Tree zero,            // optional zero value for the accumulator
                                   DataSet source,       // input data source
                                   String stop_counter ) // optional counter used in repeat operation
                            throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper",map_fnc.toString());
    conf.set("mrql.counter",stop_counter);
    if (zero != null) {
        conf.set("mrql.accumulator",acc_fnc.toString());
        conf.set("mrql.zero",zero.toString());
    } else conf.set("mrql.zero","");
    setupSplits(source,conf);
    Job job = new Job(conf,newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    for (DataSource p: source.source)
        MultipleInputs.addInputPath(job,new Path(p.path),(Class<? extends MapReduceMRQLFileInputFormat>)p.inputFormat,cMapMapper.class);
    FileOutputFormat.setOutputPath(job,new Path(newpath));
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
             : job.getCounters().findCounter("mrql",stop_counter).getValue();
    return new DataSet(new BinaryDataSource(newpath,conf),c,outputRecords(job));
}