org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat Java Examples
The following examples show how to use
org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.
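Before the individual examples, here is a minimal, self-contained sketch of the typical wiring: a map-only job whose output format is switched from plain text to SequenceFileOutputFormat. It uses Hadoop's built-in identity Mapper and takes input/output paths from args; the job name and class name are illustrative and not taken from any project below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SequenceFileOutputSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "sequence-file-output-sketch"); // illustrative job name
        job.setJarByClass(SequenceFileOutputSketch.class);

        // Read plain text; keys are byte offsets (LongWritable), values are lines (Text).
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // Map-only job using Hadoop's identity Mapper: records pass through unchanged.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);

        // Declared output types must match what the mapper emits.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Write the output as a binary SequenceFile instead of plain text, block-compressed.
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Most of the examples that follow are variations on this pattern, differing mainly in the mapper/reducer classes, the key/value types, and compression or multiple-output settings.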
Example #1
Source File: KMeansDriver.java From flink-perf with Apache License 2.0
public static void initializeCenters(Configuration conf, FileSystem fs, String pointsPath, String seqFilePath) throws Exception {
    Path points = new Path(pointsPath);
    Path seqFile = new Path(seqFilePath);
    if (fs.exists(seqFile)) {
        fs.delete(seqFile, true);
    }
    Job job = Job.getInstance(conf);
    job.setMapperClass(CenterInitializer.class);
    job.setReducerClass(Reducer.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(Centroid.class);
    job.setMapOutputValueClass(Point.class);
    job.setOutputKeyClass(Centroid.class);
    job.setOutputValueClass(Point.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(pointsPath));
    FileOutputFormat.setOutputPath(job, seqFile);
    job.waitForCompletion(true);
}
Example #2
Source File: ConvergeCuboidDataUtil.java From kylin with Apache License 2.0
public static void setupReducer(Job job, CubeSegment cubeSegment, Path output) throws IOException {
    // Output
    // prevent to create zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, output);

    // Reducer
    job.setReducerClass(ConvergeCuboidDataReducer.class);
    job.setPartitionerClass(ConvergeCuboidDataPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Pair<Integer, Integer> numReduceTasks = MapReduceUtil.getConvergeCuboidDataReduceTaskNums(cubeSegment);
    job.setNumReduceTasks(numReduceTasks.getFirst());

    int nBaseReduceTasks = numReduceTasks.getSecond();
    boolean enableSharding = cubeSegment.isEnableSharding();
    long baseCuboidId = cubeSegment.getCuboidScheduler().getBaseCuboidId();
    String partiParams = enableSharding + "," + baseCuboidId + "," + nBaseReduceTasks;
    job.getConfiguration().set(BatchConstants.CFG_CONVERGE_CUBOID_PARTITION_PARAM, partiParams);
}
Example #3
Source File: RandomTextWriterJob.java From hadoop with Apache License 2.0
public Job createJob(Configuration conf) throws IOException {
    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 10 * 1024);
    long totalBytesToWrite = conf.getLong(TOTAL_BYTES, numBytesToWritePerMap);
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
    }
    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);
    Job job = Job.getInstance(conf);
    job.setJarByClass(RandomTextWriterJob.class);
    job.setJobName("random-text-writer");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RandomInputFormat.class);
    job.setMapperClass(RandomTextMapper.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    //FileOutputFormat.setOutputPath(job, new Path("random-output"));
    job.setNumReduceTasks(0);
    return job;
}
Example #4
Source File: TestJoinDatamerge.java From big-c with Apache License 2.0
private static void joinAs(String jointype,
        Class<? extends SimpleCheckerMapBase<?>> map,
        Class<? extends SimpleCheckerReduceBase> reduce) throws Exception {
    final int srcs = 4;
    Configuration conf = new Configuration();
    Path base = cluster.getFileSystem().makeQualified(new Path("/" + jointype));
    Path[] src = writeSimpleSrc(base, conf, srcs);
    conf.set(CompositeInputFormat.JOIN_EXPR,
        CompositeInputFormat.compose(jointype, SequenceFileInputFormat.class, src));
    conf.setInt("testdatamerge.sources", srcs);
    Job job = Job.getInstance(conf);
    job.setInputFormatClass(CompositeInputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(base, "out"));
    job.setMapperClass(map);
    job.setReducerClass(reduce);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.waitForCompletion(true);
    assertTrue("Job failed", job.isSuccessful());
    if ("outer".equals(jointype)) {
        checkOuterConsistency(job, src);
    }
    base.getFileSystem(conf).delete(base, true);
}
Example #5
Source File: Step4.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
    Job job1 = new Job(conf1, "wiki job four");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step4.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper4.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);
    job1.setReducerClass(WiKiReducer4.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorAndPrefsWritable.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT1_PATH));
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT2_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}
Example #6
Source File: DataDrivenImportJob.java From aliyun-maxcompute-data-collectors with Apache License 2.0
@Override
protected Class<? extends OutputFormat> getOutputFormatClass() throws ClassNotFoundException {
    if (isHCatJob) {
        LOG.debug("Returning HCatOutputFormat for output format");
        return SqoopHCatUtilities.getOutputFormatClass();
    }
    if (options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
        return RawKeyTextOutputFormat.class;
    } else if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
        return SequenceFileOutputFormat.class;
    } else if (options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
        return AvroOutputFormat.class;
    } else if (options.getFileLayout() == SqoopOptions.FileLayout.ParquetFile) {
        return DatasetKeyOutputFormat.class;
    }
    return null;
}
Example #7
Source File: KafkaFlatTableJob.java From kylin with Apache License 2.0
private void setupMapper(CubeSegment cubeSeg) throws IOException {
    // set the segment's offset info to job conf
    Map<Integer, Long> offsetStart = cubeSeg.getSourcePartitionOffsetStart();
    Map<Integer, Long> offsetEnd = cubeSeg.getSourcePartitionOffsetEnd();
    Integer minPartition = Collections.min(offsetStart.keySet());
    Integer maxPartition = Collections.max(offsetStart.keySet());
    job.getConfiguration().set(CONFIG_KAFKA_PARITION_MIN, minPartition.toString());
    job.getConfiguration().set(CONFIG_KAFKA_PARITION_MAX, maxPartition.toString());
    for (Integer partition : offsetStart.keySet()) {
        job.getConfiguration().set(CONFIG_KAFKA_PARITION_START + partition, offsetStart.get(partition).toString());
        job.getConfiguration().set(CONFIG_KAFKA_PARITION_END + partition, offsetEnd.get(partition).toString());
    }
    job.setMapperClass(KafkaFlatTableMapper.class);
    job.setInputFormatClass(KafkaInputFormat.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
}
Example #8
Source File: Export.java From hbase with Apache License 2.0
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    Triple<TableName, Scan, Path> arguments = ExportUtils.getArgumentsFromCommandLine(conf, args);
    String tableName = arguments.getFirst().getNameAsString();
    Path outputDir = arguments.getThird();
    Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJobName(NAME + "_" + tableName);
    job.setJarByClass(Export.class);
    // Set optional scan parameters
    Scan s = arguments.getSecond();
    IdentityTableMapper.initJob(tableName, s, IdentityTableMapper.class, job);
    // No reducers. Just write straight to output files.
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Result.class);
    FileOutputFormat.setOutputPath(job, outputDir); // job conf doesn't contain the conf so doesn't have a default fs.
    return job;
}
Example #9
Source File: Step2.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    Job job1 = new Job(conf1, "wiki job two");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step2.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper2.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(IntWritable.class);
    job1.setReducerClass(WiKiReducer2.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorWritable.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}
Example #10
Source File: HBaseMROutput2Transition.java From kylin-on-parquet-v2 with Apache License 2.0
@Override
public void configureJobOutput(Job job, String output, CubeSegment segment, CuboidScheduler cuboidScheduler,
        int level) throws Exception {
    int reducerNum = 1;
    Class mapperClass = job.getMapperClass();

    // allow user specially set config for base cuboid step
    if (mapperClass == HiveToBaseCuboidMapper.class) {
        for (Map.Entry<String, String> entry : segment.getConfig().getBaseCuboidMRConfigOverride().entrySet()) {
            job.getConfiguration().set(entry.getKey(), entry.getValue());
        }
    }

    if (mapperClass == HiveToBaseCuboidMapper.class || mapperClass == NDCuboidMapper.class) {
        reducerNum = MapReduceUtil.getLayeredCubingReduceTaskNum(segment, cuboidScheduler,
            AbstractHadoopJob.getTotalMapInputMB(job), level);
    } else if (mapperClass == InMemCuboidMapper.class) {
        reducerNum = MapReduceUtil.getInmemCubingReduceTaskNum(segment, cuboidScheduler);
    }
    Path outputPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(reducerNum);
    HadoopUtil.deletePath(job.getConfiguration(), outputPath);
}
Example #11
Source File: HadoopFormatIOSequenceFileTest.java From beam with Apache License 2.0
@Test
public void batchTest() {
    String outputDir = getOutputDirPath("batchTest");

    Configuration conf =
        createWriteConf(
            SequenceFileOutputFormat.class,
            Text.class,
            LongWritable.class,
            outputDir,
            REDUCERS_COUNT,
            "0");

    executeBatchTest(
        HadoopFormatIO.<Text, LongWritable>write()
            .withConfiguration(conf)
            .withPartitioning()
            .withExternalSynchronization(new HDFSSynchronization(getLocksDirPath())),
        outputDir);

    Assert.assertEquals(
        "In lock folder shouldn't be any file", 0, new File(getLocksDirPath()).list().length);
}
Example #12
Source File: SequenceFileStoreFunc.java From hiped2 with Apache License 2.0
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.setOutputKeyClass(keyClass);
    job.setOutputValueClass(valueClass);
    if (compressionType != null && compressionCodecClass != null) {
        Class<? extends CompressionCodec> codecClass =
            FileOutputFormat.getOutputCompressorClass(job, DefaultCodec.class);
        SequenceFileOutputFormat.setOutputCompressorClass(job, codecClass);
        SequenceFileOutputFormat.setOutputCompressionType(job,
            SequenceFile.CompressionType.valueOf(compressionType));
    }
    FileOutputFormat.setOutputPath(job, new Path(location));
}
Example #13
Source File: ConvergeCuboidDataUtil.java From kylin-on-parquet-v2 with Apache License 2.0
public static void setupReducer(Job job, CubeSegment cubeSegment, Path output) throws IOException {
    // Output
    // prevent to create zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, output);

    // Reducer
    job.setReducerClass(ConvergeCuboidDataReducer.class);
    job.setPartitionerClass(ConvergeCuboidDataPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Pair<Integer, Integer> numReduceTasks = MapReduceUtil.getConvergeCuboidDataReduceTaskNums(cubeSegment);
    job.setNumReduceTasks(numReduceTasks.getFirst());

    int nBaseReduceTasks = numReduceTasks.getSecond();
    boolean enableSharding = cubeSegment.isEnableSharding();
    long baseCuboidId = cubeSegment.getCuboidScheduler().getBaseCuboidId();
    String partiParams = enableSharding + "," + baseCuboidId + "," + nBaseReduceTasks;
    job.getConfiguration().set(BatchConstants.CFG_CONVERGE_CUBOID_PARTITION_PARAM, partiParams);
}
Example #14
Source File: HadoopFormatIOSequenceFileTest.java From beam with Apache License 2.0
@Override
public PCollectionView<Configuration> expand(PCollection<? extends KV<KeyT, ValueT>> input) {
    Configuration conf =
        createWriteConf(
            SequenceFileOutputFormat.class,
            keyClass,
            valueClass,
            outputDirPath,
            REDUCERS_COUNT,
            String.valueOf(windowNum++));
    return input
        .getPipeline()
        .apply(Create.<Configuration>of(conf))
        .apply(View.<Configuration>asSingleton().withDefaultValue(conf));
}
Example #15
Source File: SaveToHDFSFunction.java From oryx with Apache License 2.0
@Override
public void call(JavaPairRDD<K,M> rdd, Time time) throws IOException {
    if (rdd.isEmpty()) {
        log.info("RDD was empty, not saving to HDFS");
    } else {
        String file = prefix + '-' + time.milliseconds() + '.' + suffix;
        Path path = new Path(file);
        FileSystem fs = FileSystem.get(path.toUri(), hadoopConf);
        if (fs.exists(path)) {
            log.warn("Saved data already existed, possibly from a failed job. Deleting {}", path);
            fs.delete(path, true);
        }
        log.info("Saving RDD to HDFS at {}", file);
        rdd.mapToPair(
            new ValueToWritableFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass)
        ).saveAsNewAPIHadoopFile(
            file,
            keyWritableClass,
            messageWritableClass,
            SequenceFileOutputFormat.class,
            hadoopConf);
    }
}
Example #16
Source File: HdfsMrsPyramidOutputFormat.java From mrgeo with Apache License 2.0
@Override
public RecordWriter<WritableComparable<?>, Writable> getRecordWriter(TaskAttemptContext context) throws IOException {
    CompressionCodec codec = null;
    CompressionType compressionType = CompressionType.NONE;
    if (getCompressOutput(context)) {
        // find the kind of compression to do
        compressionType = SequenceFileOutputFormat.getOutputCompressionType(context);
        // find the right codec
        codec = getCompressionCodec(context);
    }
    Path file = getDefaultWorkFile(context, "");
    MapFile.Writer out = createMapFileWriter(context, codec, compressionType, file);
    return new Writer(out);
}
Example #17
Source File: DistBlockIntegrityMonitor.java From RDFS with Apache License 2.0
/**
 * determines which files have failed for a given job
 */
private HashMap<String, String> getFailedFiles(Job job) throws IOException {
    HashMap<String, String> failedFiles = new HashMap<String, String>();

    Path outDir = SequenceFileOutputFormat.getOutputPath(job);
    FileSystem fs = outDir.getFileSystem(getConf());
    if (!fs.getFileStatus(outDir).isDir()) {
        throw new IOException(outDir.toString() + " is not a directory");
    }

    FileStatus[] files = fs.listStatus(outDir);
    for (FileStatus f : files) {
        Path fPath = f.getPath();
        if ((!f.isDir()) && (fPath.getName().startsWith(PART_PREFIX))) {
            LOG.info("opening " + fPath.toString());
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, fPath, getConf());

            Text key = new Text();
            Text value = new Text();
            while (reader.next(key, value)) {
                LOG.info("key: " + key.toString() + " , value: " + value.toString());
                failedFiles.put(key.toString(), value.toString());
            }
            reader.close();
        }
    }
    return failedFiles;
}
Example #18
Source File: AbstractReasoningTool.java From rya with Apache License 2.0
/**
 * Set up a MapReduce job to output newly derived triples.
 * @param intermediate True if this is intermediate data. Outputs
 *        to [base]-[iteration]-[temp].
 */
protected void configureDerivationOutput(boolean intermediate) {
    Path outPath;
    Configuration conf = job.getConfiguration();
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    if (intermediate) {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration + MRReasoningUtils.TEMP_SUFFIX);
    } else {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration);
    }
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT,
        SequenceFileOutputFormat.class, Derivation.class, NullWritable.class);
    MultipleOutputs.setCountersEnabled(job, true);
    // Set up an output for diagnostic info, if needed
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
}
Example #19
Source File: IndexerJobDriver.java From incubator-retired-blur with Apache License 2.0
private PartitionedInputResult buildPartitionedInputData(String uuid, Path tmpPath, TableDescriptor descriptor,
        List<Path> inprogressPathList, String snapshot, Path fileCachePath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = Job.getInstance(getConf(), "Partitioning data for table [" + descriptor.getName() + "]");
    job.getConfiguration().set(BLUR_UPDATE_ID, uuid);

    // Needed for the bloom filter path information.
    BlurOutputFormat.setTableDescriptor(job, descriptor);
    BlurInputFormat.setLocalCachePath(job, fileCachePath);
    ExistingDataIndexLookupMapper.setSnapshot(job, snapshot);

    for (Path p : inprogressPathList) {
        FileInputFormat.addInputPath(job, p);
    }
    Path outputPath = new Path(tmpPath, UUID.randomUUID().toString());

    job.setJarByClass(getClass());
    job.setMapperClass(LookupBuilderMapper.class);
    job.setReducerClass(LookupBuilderReducer.class);

    int shardCount = descriptor.getShardCount();
    job.setNumReduceTasks(shardCount);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BooleanWritable.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    if (job.waitForCompletion(true)) {
        return new PartitionedInputResult(outputPath, job.getCounters(), shardCount,
            job.getTaskReports(TaskType.REDUCE));
    } else {
        throw new IOException("Partitioning failed!");
    }
}
Example #20
Source File: SparkStorageUtils.java From deeplearning4j with Apache License 2.0
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as
 * {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd,
        Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId(); //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
        dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class,
        SequenceFileOutputFormat.class);
}
Example #21
Source File: PcapJob.java From metron with Apache License 2.0
/**
 * Creates, but does not submit the job. This is the core MapReduce mrJob. Empty input path
 * results in a null to be returned instead of creating the job.
 */
public Job createJob(Optional<String> jobName,
        Path basePath,
        Path jobOutputPath,
        long beginNS,
        long endNS,
        int numReducers,
        T fields,
        Configuration conf,
        FileSystem fs,
        PcapFilterConfigurator<T> filterImpl) throws IOException {
    Iterable<String> filteredPaths = FileFilterUtil.getPathsInTimeRange(beginNS, endNS, listFiles(fs, basePath));
    String inputPaths = Joiner.on(',').join(filteredPaths);
    if (StringUtils.isEmpty(inputPaths)) {
        return null;
    }
    conf.set(START_TS_CONF, Long.toUnsignedString(beginNS));
    conf.set(END_TS_CONF, Long.toUnsignedString(endNS));
    conf.set(WIDTH_CONF, "" + findWidth(beginNS, endNS, numReducers));
    filterImpl.addToConfig(fields, conf);
    Job job = Job.getInstance(conf);
    jobName.ifPresent(job::setJobName);
    job.setJarByClass(PcapJob.class);
    job.setMapperClass(PcapJob.PcapMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(numReducers);
    job.setReducerClass(PcapReducer.class);
    job.setPartitionerClass(PcapPartitioner.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    SequenceFileInputFormat.addInputPaths(job, inputPaths);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, jobOutputPath);
    return job;
}
Example #22
Source File: TeraStreamValidate.java From pravega-samples with Apache License 2.0
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        usage();
        return 2;
    }
    LOG.info("starting");
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    getConf().setStrings(INPUT_URI_STRING, args[2]);
    getConf().setStrings(INPUT_SCOPE_NAME, args[3]);
    getConf().setStrings(INPUT_STREAM_NAME, args[4]);
    getConf().setStrings(INPUT_DESERIALIZER, TextSerializer.class.getName());
    getConf().setInt(MRJobConfig.NUM_MAPS, 1);
    Job job = Job.getInstance(getConf());
    TeraInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraStreamValidate");
    job.setJarByClass(TeraStreamValidate.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(TeraSortMapper.class);
    job.setNumReduceTasks(1);
    job.setInputFormatClass(PravegaInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    LOG.info("done");
    return ret;
}
Example #23
Source File: IIDistinctColumnsJob.java From Kylin with Apache License 2.0
private void setupReducer(Path output) throws IOException {
    job.setReducerClass(IIDistinctColumnsReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.OUTPUT_PATH, output.toString());

    job.setNumReduceTasks(1);

    deletePath(job.getConfiguration(), output);
}
Example #24
Source File: MapOperation.java From incubator-retired-mrql with Apache License 2.0
/** The cMap physical operator
 * @param map_fnc mapper function
 * @param acc_fnc optional accumulator function
 * @param zero optional the zero value for the accumulator
 * @param source input data source
 * @param stop_counter optional counter used in repeat operation
 * @return a new data source that contains the result
 */
public final static DataSet cMap ( Tree map_fnc,          // mapper function
                                   Tree acc_fnc,          // optional accumulator function
                                   Tree zero,             // optional the zero value for the accumulator
                                   DataSet source,        // input data source
                                   String stop_counter )  // optional counter used in repeat operation
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper", map_fnc.toString());
    conf.set("mrql.counter", stop_counter);
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    setupSplits(source, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    for (DataSource p : source.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
            (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, cMapMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
             : job.getCounters().findCounter("mrql", stop_counter).getValue();
    return new DataSet(new BinaryDataSource(newpath, conf), c, outputRecords(job));
}
Example #25
Source File: UHCDictionaryJob.java From kylin with Apache License 2.0
private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(UHCDictionaryReducer.class);
    job.setPartitionerClass(UHCDictionaryPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
        NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent to create zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
Example #26
Source File: SimpleExample.java From hadoop-sstable with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    long startTime = System.currentTimeMillis();
    Options options = buildOptions();

    CommandLineParser cliParser = new BasicParser();
    CommandLine cli = cliParser.parse(options, args);
    if (cli.getArgs().length < 2) {
        printUsage(options);
    }

    Job job = getJobConf(cli);
    job.setJobName("Simple Example");
    job.setJarByClass(SimpleExample.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(SimpleExampleMapper.class);
    job.setReducerClass(SimpleExampleReducer.class);

    job.setInputFormatClass(SSTableRowInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    String inputPaths = cli.getArgs()[0];
    LOG.info("Setting initial input paths to {}", inputPaths);
    SSTableInputFormat.addInputPaths(job, inputPaths);

    final String outputPath = cli.getArgs()[1];
    LOG.info("Setting initial output paths to {}", outputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    boolean success = job.waitForCompletion(true);
    LOG.info("Total runtime: {}s", (System.currentTimeMillis() - startTime) / 1000);
    return success ? 0 : 1;
}
Example #27
Source File: TestMultiMROutput.java From tez with Apache License 2.0
@Test
public void testMergeConf() throws Exception {
    JobConf payloadConf = new JobConf();
    payloadConf.set("local-key", "local-value");
    DataSinkDescriptor dataSink = MultiMROutput.createConfigBuilder(
        payloadConf, SequenceFileOutputFormat.class, "/output", false).build();

    Configuration baseConf = new Configuration(false);
    baseConf.set("base-key", "base-value");

    OutputContext outputContext = mock(OutputContext.class);
    ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
    when(outputContext.getUserPayload()).thenReturn(dataSink.getOutputDescriptor().getUserPayload());
    when(outputContext.getApplicationId()).thenReturn(appId);
    when(outputContext.getTaskVertexIndex()).thenReturn(1);
    when(outputContext.getTaskAttemptNumber()).thenReturn(1);
    when(outputContext.getCounters()).thenReturn(new TezCounters());
    when(outputContext.getStatisticsReporter()).thenReturn(mock(OutputStatisticsReporter.class));
    when(outputContext.getContainerConfiguration()).thenReturn(baseConf);

    MultiMROutput output = new MultiMROutput(outputContext, 2);
    output.initialize();

    Configuration mergedConf = output.jobConf;
    assertEquals("base-value", mergedConf.get("base-key"));
    assertEquals("local-value", mergedConf.get("local-key"));
}
Example #28
Source File: TestMROutputLegacy.java From tez with Apache License 2.0
@Test(timeout = 5000)
public void testNewAPI_MR() throws Exception {
    String outputPath = TEST_DIR.getAbsolutePath();
    Job job = Job.getInstance();
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.getConfiguration().setBoolean("mapred.reducer.new-api", true);
    // the output is attached to reducer
    job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, false);
    UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
    OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
        .setUserPayload(vertexPayload);
    DataSinkDescriptor sink = DataSinkDescriptor.create(od,
        OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);

    OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
    MROutputLegacy output = new MROutputLegacy(outputContext, 2);
    output.initialize();
    assertEquals(true, output.useNewApi);
    assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
    assertNull(output.oldOutputFormat);
    assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
    assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
    assertNull(output.oldApiTaskAttemptContext);
    assertNotNull(output.newRecordWriter);
    assertNull(output.oldRecordWriter);
    assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
Example #29
Source File: FactDistinctColumnsJob.java From kylin with Apache License 2.0
private void setupReducer(Path output, CubeSegment cubeSeg) throws IOException {
    FactDistinctColumnsReducerMapping reducerMapping = new FactDistinctColumnsReducerMapping(cubeSeg.getCubeInstance());
    int numberOfReducers = reducerMapping.getTotalReducerNum();
    logger.info("{} has reducers {}.", this.getClass().getName(), numberOfReducers);
    if (numberOfReducers > 250) {
        throw new IllegalArgumentException(
            "The max reducer number for FactDistinctColumnsJob is 250, but now it is " + numberOfReducers
                + ", decrease 'kylin.engine.mr.uhc-reducer-count'");
    }

    job.setReducerClass(FactDistinctColumnsReducer.class);
    job.setPartitionerClass(FactDistinctColumnPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    // make each reducer output to respective dir
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_COLUMN, SequenceFileOutputFormat.class,
        NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
        NullWritable.class, ArrayPrimitiveWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_STATISTICS, SequenceFileOutputFormat.class,
        LongWritable.class, BytesWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_PARTITION, TextOutputFormat.class,
        NullWritable.class, LongWritable.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent to create zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
Example #30
Source File: SparkStorageUtils.java From DataVec with Apache License 2.0
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as
 * {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd,
        Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId(); //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
        dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class,
        SequenceFileOutputFormat.class);
}