org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat Java Examples

The following examples show how to use org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat. Each example is an excerpt from an open-source project; the source file and license are noted above each snippet.
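Before the project-specific excerpts, here is a minimal, self-contained sketch of the pattern most of them share: configure a Job, set SequenceFileOutputFormat as the output format, and point FileOutputFormat at an output directory. The class name, paths, and identity-mapper setup below are illustrative placeholders, not taken from any of the projects that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SequenceFileWriteSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "sequence-file-write-sketch");
    job.setJarByClass(SequenceFileWriteSketch.class);

    // Identity mapper: (LongWritable offset, Text line) pairs pass straight through.
    job.setMapperClass(Mapper.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}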
Example #1
Source File: KMeansDriver.java    From flink-perf with Apache License 2.0
public static void initializeCenters (Configuration conf, FileSystem fs, String pointsPath, String seqFilePath) throws Exception {
	Path points = new Path (pointsPath);
	Path seqFile = new Path (seqFilePath);
	if (fs.exists(seqFile)) {
		fs.delete(seqFile, true);
	}
	Job job = Job.getInstance(conf);
	job.setMapperClass(CenterInitializer.class);
	job.setReducerClass(Reducer.class);
	job.setNumReduceTasks(0);
	job.setMapOutputKeyClass(Centroid.class);
	job.setMapOutputValueClass(Point.class);
	job.setOutputKeyClass(Centroid.class);
	job.setOutputValueClass(Point.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);
	job.setInputFormatClass(TextInputFormat.class);
	FileInputFormat.addInputPath(job, new Path(pointsPath));
	FileOutputFormat.setOutputPath(job, seqFile);
	job.waitForCompletion(true);
}
 
Example #2
Source File: ConvergeCuboidDataUtil.java    From kylin with Apache License 2.0
public static void setupReducer(Job job, CubeSegment cubeSegment, Path output) throws IOException {
    // Output
    // prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, output);

    // Reducer
    job.setReducerClass(ConvergeCuboidDataReducer.class);
    job.setPartitionerClass(ConvergeCuboidDataPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Pair<Integer, Integer> numReduceTasks = MapReduceUtil.getConvergeCuboidDataReduceTaskNums(cubeSegment);
    job.setNumReduceTasks(numReduceTasks.getFirst());

    int nBaseReduceTasks = numReduceTasks.getSecond();
    boolean enableSharding = cubeSegment.isEnableSharding();
    long baseCuboidId = cubeSegment.getCuboidScheduler().getBaseCuboidId();
    String partiParams = enableSharding + "," + baseCuboidId + "," + nBaseReduceTasks;
    job.getConfiguration().set(BatchConstants.CFG_CONVERGE_CUBOID_PARTITION_PARAM, partiParams);
}
 
Example #3
Source File: RandomTextWriterJob.java    From hadoop with Apache License 2.0
public Job createJob(Configuration conf) throws IOException {
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 10 * 1024);
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, numBytesToWritePerMap);
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = Job.getInstance(conf);

  job.setJarByClass(RandomTextWriterJob.class);
  job.setJobName("random-text-writer");

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomTextMapper.class);

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  //FileOutputFormat.setOutputPath(job, new Path("random-output"));
  job.setNumReduceTasks(0);
  return job;
}
 
Example #4
Source File: TestJoinDatamerge.java    From big-c with Apache License 2.0
private static void joinAs(String jointype, 
    Class<? extends SimpleCheckerMapBase<?>> map, 
    Class<? extends SimpleCheckerReduceBase> reduce) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  Path base = cluster.getFileSystem().makeQualified(new Path("/"+jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  conf.set(CompositeInputFormat.JOIN_EXPR, CompositeInputFormat.compose(jointype,
      SequenceFileInputFormat.class, src));
  conf.setInt("testdatamerge.sources", srcs);
  Job job = Job.getInstance(conf);
  job.setInputFormatClass(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));

  job.setMapperClass(map);
  job.setReducerClass(reduce);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  job.waitForCompletion(true);
  assertTrue("Job failed", job.isSuccessful());
  if ("outer".equals(jointype)) {
    checkOuterConsistency(job, src);
  }
  base.getFileSystem(conf).delete(base, true);
}
 
Example #5
Source File: Step4.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf1 = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();
    Job job1 = new Job(conf1, "wiki  job four");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step4.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper4.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);
    job1.setReducerClass(WiKiReducer4.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorAndPrefsWritable.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT1_PATH));
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT2_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // exit with an error status if the job failed
    }
}
 
Example #6
Source File: DataDrivenImportJob.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
@Override
protected Class<? extends OutputFormat> getOutputFormatClass()
    throws ClassNotFoundException {
  if (isHCatJob) {
    LOG.debug("Returning HCatOutputFormat for output format");
    return SqoopHCatUtilities.getOutputFormatClass();
  }
  if (options.getFileLayout() == SqoopOptions.FileLayout.TextFile) {
    return RawKeyTextOutputFormat.class;
  } else if (options.getFileLayout()
      == SqoopOptions.FileLayout.SequenceFile) {
    return SequenceFileOutputFormat.class;
  } else if (options.getFileLayout()
      == SqoopOptions.FileLayout.AvroDataFile) {
    return AvroOutputFormat.class;
  } else if (options.getFileLayout()
      == SqoopOptions.FileLayout.ParquetFile) {
    return DatasetKeyOutputFormat.class;
  }

  return null;
}
 
Example #7
Source File: KafkaFlatTableJob.java    From kylin with Apache License 2.0
private void setupMapper(CubeSegment cubeSeg) throws IOException {
    // set the segment's offset info to job conf
    Map<Integer, Long> offsetStart = cubeSeg.getSourcePartitionOffsetStart();
    Map<Integer, Long> offsetEnd = cubeSeg.getSourcePartitionOffsetEnd();

    Integer minPartition = Collections.min(offsetStart.keySet());
    Integer maxPartition = Collections.max(offsetStart.keySet());
    job.getConfiguration().set(CONFIG_KAFKA_PARITION_MIN, minPartition.toString());
    job.getConfiguration().set(CONFIG_KAFKA_PARITION_MAX, maxPartition.toString());

    for(Integer partition: offsetStart.keySet()) {
        job.getConfiguration().set(CONFIG_KAFKA_PARITION_START + partition, offsetStart.get(partition).toString());
        job.getConfiguration().set(CONFIG_KAFKA_PARITION_END + partition, offsetEnd.get(partition).toString());
    }

    job.setMapperClass(KafkaFlatTableMapper.class);
    job.setInputFormatClass(KafkaInputFormat.class);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
}
 
Example #8
Source File: Export.java    From hbase with Apache License 2.0
/**
 * Sets up the actual job.
 *
 * @param conf  The current configuration.
 * @param args  The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args)
throws IOException {
  Triple<TableName, Scan, Path> arguments = ExportUtils.getArgumentsFromCommandLine(conf, args);
  String tableName = arguments.getFirst().getNameAsString();
  Path outputDir = arguments.getThird();
  Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
  job.setJobName(NAME + "_" + tableName);
  job.setJarByClass(Export.class);
  // Set optional scan parameters
  Scan s = arguments.getSecond();
  IdentityTableMapper.initJob(tableName, s, IdentityTableMapper.class, job);
  // No reducers.  Just write straight to output files.
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(Result.class);
  FileOutputFormat.setOutputPath(job, outputDir); // job conf doesn't contain the conf so doesn't have a default fs.
  return job;
}
 
Example #9
Source File: Step2.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    Configuration conf1 = new Configuration();

    Job job1 = new Job(conf1, "wiki  job two");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step2.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper2.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(IntWritable.class);
    job1.setReducerClass(WiKiReducer2.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorWritable.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // exit with an error status if the job failed
    }
}
 
Example #10
Source File: HBaseMROutput2Transition.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public void configureJobOutput(Job job, String output, CubeSegment segment, CuboidScheduler cuboidScheduler,
        int level) throws Exception {
    int reducerNum = 1;
    Class mapperClass = job.getMapperClass();

    // allow user-specified config overrides for the base cuboid step
    if (mapperClass == HiveToBaseCuboidMapper.class) {
        for (Map.Entry<String, String> entry : segment.getConfig().getBaseCuboidMRConfigOverride().entrySet()) {
            job.getConfiguration().set(entry.getKey(), entry.getValue());
        }
    }

    if (mapperClass == HiveToBaseCuboidMapper.class || mapperClass == NDCuboidMapper.class) {
        reducerNum = MapReduceUtil.getLayeredCubingReduceTaskNum(segment, cuboidScheduler,
                AbstractHadoopJob.getTotalMapInputMB(job), level);
    } else if (mapperClass == InMemCuboidMapper.class) {
        reducerNum = MapReduceUtil.getInmemCubingReduceTaskNum(segment, cuboidScheduler);
    }
    Path outputPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(reducerNum);
    HadoopUtil.deletePath(job.getConfiguration(), outputPath);
}
 
Example #11
Source File: HadoopFormatIOSequenceFileTest.java    From beam with Apache License 2.0
@Test
public void batchTest() {

  String outputDir = getOutputDirPath("batchTest");

  Configuration conf =
      createWriteConf(
          SequenceFileOutputFormat.class,
          Text.class,
          LongWritable.class,
          outputDir,
          REDUCERS_COUNT,
          "0");

  executeBatchTest(
      HadoopFormatIO.<Text, LongWritable>write()
          .withConfiguration(conf)
          .withPartitioning()
          .withExternalSynchronization(new HDFSSynchronization(getLocksDirPath())),
      outputDir);

  Assert.assertEquals(
      "In lock folder shouldn't be any file", 0, new File(getLocksDirPath()).list().length);
}
 
Example #12
Source File: SequenceFileStoreFunc.java    From hiped2 with Apache License 2.0
@Override
public void setStoreLocation(String location, Job job)
    throws IOException {
  job.setOutputKeyClass(keyClass);
  job.setOutputValueClass(valueClass);
  if (compressionType != null && compressionCodecClass != null) {
    Class<? extends CompressionCodec> codecClass =
        FileOutputFormat.getOutputCompressorClass(job,
            DefaultCodec.class);
    SequenceFileOutputFormat.
        setOutputCompressorClass(job, codecClass);
    SequenceFileOutputFormat.setOutputCompressionType(job,
        SequenceFile.CompressionType.valueOf(compressionType));
  }
  FileOutputFormat.setOutputPath(job, new Path(location));
}
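This store function only configures the write side. As a complementary, hedged sketch (the class name and path argument are placeholders), reading such a file back needs no compression settings at all, because SequenceFile.Reader picks the codec and the key/value classes up from the file header:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SequenceFileDumpSketch {
  // Prints every key/value pair in a SequenceFile, compressed or not.
  public static void dump(Configuration conf, Path file) throws IOException {
    try (SequenceFile.Reader reader =
             new SequenceFile.Reader(conf, SequenceFile.Reader.file(file))) {
      // The key and value classes are recorded in the file header, so they can
      // be instantiated reflectively without knowing them at compile time.
      Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
      while (reader.next(key, value)) {
        System.out.println(key + "\t" + value);
      }
    }
  }
}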
 
Example #13
Source File: ConvergeCuboidDataUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
public static void setupReducer(Job job, CubeSegment cubeSegment, Path output) throws IOException {
    // Output
    // prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, output);

    // Reducer
    job.setReducerClass(ConvergeCuboidDataReducer.class);
    job.setPartitionerClass(ConvergeCuboidDataPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Pair<Integer, Integer> numReduceTasks = MapReduceUtil.getConvergeCuboidDataReduceTaskNums(cubeSegment);
    job.setNumReduceTasks(numReduceTasks.getFirst());

    int nBaseReduceTasks = numReduceTasks.getSecond();
    boolean enableSharding = cubeSegment.isEnableSharding();
    long baseCuboidId = cubeSegment.getCuboidScheduler().getBaseCuboidId();
    String partiParams = enableSharding + "," + baseCuboidId + "," + nBaseReduceTasks;
    job.getConfiguration().set(BatchConstants.CFG_CONVERGE_CUBOID_PARTITION_PARAM, partiParams);
}
 
Example #14
Source File: HadoopFormatIOSequenceFileTest.java    From beam with Apache License 2.0
@Override
public PCollectionView<Configuration> expand(PCollection<? extends KV<KeyT, ValueT>> input) {

  Configuration conf =
      createWriteConf(
          SequenceFileOutputFormat.class,
          keyClass,
          valueClass,
          outputDirPath,
          REDUCERS_COUNT,
          String.valueOf(windowNum++));
  return input
      .getPipeline()
      .apply(Create.<Configuration>of(conf))
      .apply(View.<Configuration>asSingleton().withDefaultValue(conf));
}
 
Example #15
Source File: SaveToHDFSFunction.java    From oryx with Apache License 2.0
@Override
public void call(JavaPairRDD<K,M> rdd, Time time) throws IOException {
  if (rdd.isEmpty()) {
    log.info("RDD was empty, not saving to HDFS");
  } else {
    String file = prefix + '-' + time.milliseconds() + '.' + suffix;
    Path path = new Path(file);
    FileSystem fs = FileSystem.get(path.toUri(), hadoopConf);
    if (fs.exists(path)) {
      log.warn("Saved data already existed, possibly from a failed job. Deleting {}", path);
      fs.delete(path, true);
    }
    log.info("Saving RDD to HDFS at {}", file);
    rdd.mapToPair(
        new ValueToWritableFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass)
    ).saveAsNewAPIHadoopFile(
        file,
        keyWritableClass,
        messageWritableClass,
        SequenceFileOutputFormat.class,
        hadoopConf);
  }
}
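As a hedged counterpart to the save above (the Text key/value classes here are placeholders and must be replaced with whatever Writable types were actually written), the same data can be loaded back into a Spark pair RDD through the new-API Hadoop file reader:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SequenceFileReadBackSketch {
  // Reads (Text, Text) pairs previously written with saveAsNewAPIHadoopFile
  // and SequenceFileOutputFormat; swap in the Writable classes your job used.
  public static JavaPairRDD<Text, Text> readBack(JavaSparkContext sc, String path) {
    return sc.newAPIHadoopFile(
        path,
        SequenceFileInputFormat.class,
        Text.class,
        Text.class,
        new Configuration());
  }
}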
 
Example #16
Source File: HdfsMrsPyramidOutputFormat.java    From mrgeo with Apache License 2.0
@Override
public RecordWriter<WritableComparable<?>, Writable> getRecordWriter(TaskAttemptContext context) throws IOException
{
  CompressionCodec codec = null;
  CompressionType compressionType = CompressionType.NONE;
  if (getCompressOutput(context))
  {
    // find the kind of compression to do
    compressionType = SequenceFileOutputFormat.getOutputCompressionType(context);

    // find the right codec
    codec = getCompressionCodec(context);
  }

  Path file = getDefaultWorkFile(context, "");

  MapFile.Writer out = createMapFileWriter(context, codec, compressionType, file);

  return new Writer(out);
}
 
Example #17
Source File: DistBlockIntegrityMonitor.java    From RDFS with Apache License 2.0
/**
 * determines which files have failed for a given job
 */
private HashMap<String, String> getFailedFiles(Job job) throws IOException {
  HashMap<String, String> failedFiles = new HashMap<String, String>();

  Path outDir = SequenceFileOutputFormat.getOutputPath(job);
  FileSystem fs  = outDir.getFileSystem(getConf());
  if (!fs.getFileStatus(outDir).isDir()) {
    throw new IOException(outDir.toString() + " is not a directory");
  }

  FileStatus[] files = fs.listStatus(outDir);

  for (FileStatus f: files) {
    Path fPath = f.getPath();
    if ((!f.isDir()) && (fPath.getName().startsWith(PART_PREFIX))) {
      LOG.info("opening " + fPath.toString());
      SequenceFile.Reader reader = 
        new SequenceFile.Reader(fs, fPath, getConf());

      Text key = new Text();
      Text value = new Text();
      while (reader.next(key, value)) {
        LOG.info("key: " + key.toString() + " , value: " + value.toString());
        failedFiles.put(key.toString(), value.toString());
      }
      reader.close();
    }
  }
  return failedFiles;
}
 
Example #18
Source File: AbstractReasoningTool.java    From rya with Apache License 2.0
/**
 * Set up a MapReduce job to output newly derived triples.
 * @param   intermediate    True if this is intermediate data. Outputs
 *                          to [base]-[iteration]-[temp].
 */
protected void configureDerivationOutput(boolean intermediate) {
    Path outPath;
    Configuration conf = job.getConfiguration();
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    if (intermediate) {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration
            + MRReasoningUtils.TEMP_SUFFIX);
    }
    else {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration);
    }
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT,
        SequenceFileOutputFormat.class, Derivation.class, NullWritable.class);
    MultipleOutputs.setCountersEnabled(job, true);
    // Set up an output for diagnostic info, if needed
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
}
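The named outputs registered above are written from inside the map or reduce tasks through a MultipleOutputs instance. The reducer below is a generic, hedged sketch of that write side (the Text/NullWritable types and the "intermediate" name are placeholders, not the actual rya classes); the name passed to write() must match one registered with addNamedOutput:

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class NamedOutputReducer extends Reducer<Text, Text, Text, NullWritable> {
  private MultipleOutputs<Text, NullWritable> mos;

  @Override
  protected void setup(Context context) {
    mos = new MultipleOutputs<>(context);
  }

  @Override
  protected void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    // Route each key to the "intermediate" named output instead of the default one.
    mos.write("intermediate", key, NullWritable.get());
  }

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    mos.close();  // flush all named outputs
  }
}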
 
Example #19
Source File: IndexerJobDriver.java    From incubator-retired-blur with Apache License 2.0
private PartitionedInputResult buildPartitionedInputData(String uuid, Path tmpPath, TableDescriptor descriptor,
    List<Path> inprogressPathList, String snapshot, Path fileCachePath) throws IOException, ClassNotFoundException,
    InterruptedException {
  Job job = Job.getInstance(getConf(), "Partitioning data for table [" + descriptor.getName() + "]");
  job.getConfiguration().set(BLUR_UPDATE_ID, uuid);

  // Needed for the bloom filter path information.
  BlurOutputFormat.setTableDescriptor(job, descriptor);
  BlurInputFormat.setLocalCachePath(job, fileCachePath);
  ExistingDataIndexLookupMapper.setSnapshot(job, snapshot);

  for (Path p : inprogressPathList) {
    FileInputFormat.addInputPath(job, p);
  }
  Path outputPath = new Path(tmpPath, UUID.randomUUID().toString());
  job.setJarByClass(getClass());
  job.setMapperClass(LookupBuilderMapper.class);
  job.setReducerClass(LookupBuilderReducer.class);

  int shardCount = descriptor.getShardCount();
  job.setNumReduceTasks(shardCount);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(BooleanWritable.class);
  FileOutputFormat.setOutputPath(job, outputPath);
  if (job.waitForCompletion(true)) {
    return new PartitionedInputResult(outputPath, job.getCounters(), shardCount, job.getTaskReports(TaskType.REDUCE));
  } else {
    throw new IOException("Partitioning failed!");
  }
}
 
Example #20
Source File: SparkStorageUtils.java    From deeplearning4j with Apache License 2.0
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd,
                 Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId(); //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class,
                    SequenceFileOutputFormat.class);
}
 
Example #21
Source File: PcapJob.java    From metron with Apache License 2.0
/**
 * Creates, but does not submit, the job. This is the core MapReduce job. An empty input path
 * results in null being returned instead of a job being created.
 */
public Job createJob(Optional<String> jobName
                    ,Path basePath
                    , Path jobOutputPath
                    , long beginNS
                    , long endNS
                    , int numReducers
                    , T fields
                    , Configuration conf
                    , FileSystem fs
                    , PcapFilterConfigurator<T> filterImpl
                    ) throws IOException
{
  Iterable<String> filteredPaths = FileFilterUtil.getPathsInTimeRange(beginNS, endNS, listFiles(fs, basePath));
  String inputPaths = Joiner.on(',').join(filteredPaths);
  if (StringUtils.isEmpty(inputPaths)) {
    return null;
  }
  conf.set(START_TS_CONF, Long.toUnsignedString(beginNS));
  conf.set(END_TS_CONF, Long.toUnsignedString(endNS));
  conf.set(WIDTH_CONF, "" + findWidth(beginNS, endNS, numReducers));
  filterImpl.addToConfig(fields, conf);
  Job job = Job.getInstance(conf);
  jobName.ifPresent(job::setJobName);
  job.setJarByClass(PcapJob.class);
  job.setMapperClass(PcapJob.PcapMapper.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(BytesWritable.class);
  job.setNumReduceTasks(numReducers);
  job.setReducerClass(PcapReducer.class);
  job.setPartitionerClass(PcapPartitioner.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  SequenceFileInputFormat.addInputPaths(job, inputPaths);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, jobOutputPath);
  return job;
}
 
Example #22
Source File: TeraStreamValidate.java    From pravega-samples with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length != 5) {
    usage();
    return 2;
  }
  LOG.info("starting");
  Path inputDir = new Path(args[0]);
  Path outputDir = new Path(args[1]);
  getConf().setStrings(INPUT_URI_STRING, args[2]);
  getConf().setStrings(INPUT_SCOPE_NAME, args[3]);
  getConf().setStrings(INPUT_STREAM_NAME, args[4]);
  getConf().setStrings(INPUT_DESERIALIZER, TextSerializer.class.getName());

  getConf().setInt(MRJobConfig.NUM_MAPS, 1);
  Job job = Job.getInstance(getConf());

  TeraInputFormat.setInputPaths(job, inputDir);
  FileOutputFormat.setOutputPath(job, outputDir);

  job.setJobName("TeraStreamValidate");
  job.setJarByClass(TeraStreamValidate.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(TeraSortMapper.class);
  job.setNumReduceTasks(1);

  job.setInputFormatClass(PravegaInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  int ret = job.waitForCompletion(true) ? 0 : 1;
  LOG.info("done");
  return ret;
}
 
Example #23
Source File: IIDistinctColumnsJob.java    From Kylin with Apache License 2.0
private void setupReducer(Path output) throws IOException {
    job.setReducerClass(IIDistinctColumnsReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.OUTPUT_PATH, output.toString());

    job.setNumReduceTasks(1);

    deletePath(job.getConfiguration(), output);
}
 
Example #24
Source File: MapOperation.java    From incubator-retired-mrql with Apache License 2.0
/** The cMap physical operator
 * @param map_fnc       mapper function
 * @param acc_fnc       optional accumulator function
 * @param zero          optional zero value for the accumulator
 * @param source        input data source
 * @param stop_counter  optional counter used in repeat operation
 * @return a new data source that contains the result
 */
public final static DataSet cMap ( Tree map_fnc,         // mapper function
                                   Tree acc_fnc,         // optional accumulator function
                                   Tree zero,            // optional zero value for the accumulator
                                   DataSet source,       // input data source
                                   String stop_counter ) // optional counter used in repeat operation
                            throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper",map_fnc.toString());
    conf.set("mrql.counter",stop_counter);
    if (zero != null) {
        conf.set("mrql.accumulator",acc_fnc.toString());
        conf.set("mrql.zero",zero.toString());
    } else conf.set("mrql.zero","");
    setupSplits(source,conf);
    Job job = new Job(conf,newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    for (DataSource p: source.source)
        MultipleInputs.addInputPath(job,new Path(p.path),(Class<? extends MapReduceMRQLFileInputFormat>)p.inputFormat,cMapMapper.class);
    FileOutputFormat.setOutputPath(job,new Path(newpath));
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
             : job.getCounters().findCounter("mrql",stop_counter).getValue();
    return new DataSet(new BinaryDataSource(newpath,conf),c,outputRecords(job));
}
 
Example #25
Source File: UHCDictionaryJob.java    From kylin with Apache License 2.0
private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(UHCDictionaryReducer.class);
    job.setPartitionerClass(UHCDictionaryPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class, NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
 
Example #26
Source File: SimpleExample.java    From hadoop-sstable with Apache License 2.0
@Override
public int run(String[] args) throws Exception {

    long startTime = System.currentTimeMillis();
    Options options = buildOptions();

    CommandLineParser cliParser = new BasicParser();
    CommandLine cli = cliParser.parse(options, args);
    if (cli.getArgs().length < 2) {
        printUsage(options);
    }
    Job job = getJobConf(cli);

    job.setJobName("Simple Example");

    job.setJarByClass(SimpleExample.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(SimpleExampleMapper.class);
    job.setReducerClass(SimpleExampleReducer.class);

    job.setInputFormatClass(SSTableRowInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    String inputPaths = cli.getArgs()[0];
    LOG.info("Setting initial input paths to {}", inputPaths);
    SSTableInputFormat.addInputPaths(job, inputPaths);

    final String outputPath = cli.getArgs()[1];
    LOG.info("Setting initial output paths to {}", outputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    boolean success = job.waitForCompletion(true);
    LOG.info("Total runtime: {}s", (System.currentTimeMillis() - startTime) / 1000);
    return success ? 0 : 1;
}
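For reference, the three SequenceFileOutputFormat compression setters used above are thin wrappers over plain configuration keys. The sketch below shows a hedged equivalent using the constants defined on the new-API FileOutputFormat (key names should be verified against the Hadoop version in use; older releases used the deprecated mapred.* names):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompressionConfSketch {
  // Mirrors setCompressOutput(true), setOutputCompressorClass(GzipCodec) and
  // setOutputCompressionType(BLOCK) by setting the underlying configuration keys.
  public static void configureCompression(Job job) {
    Configuration conf = job.getConfiguration();
    conf.setBoolean(FileOutputFormat.COMPRESS, true);
    conf.setClass(FileOutputFormat.COMPRESS_CODEC, GzipCodec.class, CompressionCodec.class);
    conf.set(FileOutputFormat.COMPRESS_TYPE, SequenceFile.CompressionType.BLOCK.toString());
  }
}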
 
Example #27
Source File: TestMultiMROutput.java    From tez with Apache License 2.0
@Test
public void testMergeConf() throws Exception {
  JobConf payloadConf = new JobConf();
  payloadConf.set("local-key", "local-value");
  DataSinkDescriptor dataSink = MultiMROutput.createConfigBuilder(
      payloadConf, SequenceFileOutputFormat.class, "/output", false).build();

  Configuration baseConf = new Configuration(false);
  baseConf.set("base-key", "base-value");

  OutputContext outputContext = mock(OutputContext.class);
  ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
  when(outputContext.getUserPayload()).thenReturn(dataSink.getOutputDescriptor().getUserPayload());
  when(outputContext.getApplicationId()).thenReturn(appId);
  when(outputContext.getTaskVertexIndex()).thenReturn(1);
  when(outputContext.getTaskAttemptNumber()).thenReturn(1);
  when(outputContext.getCounters()).thenReturn(new TezCounters());
  when(outputContext.getStatisticsReporter()).thenReturn(mock(OutputStatisticsReporter.class));
  when(outputContext.getContainerConfiguration()).thenReturn(baseConf);

  MultiMROutput output = new MultiMROutput(outputContext, 2);
  output.initialize();

  Configuration mergedConf = output.jobConf;
  assertEquals("base-value", mergedConf.get("base-key"));
  assertEquals("local-value", mergedConf.get("local-key"));
}
 
Example #28
Source File: TestMROutputLegacy.java    From tez with Apache License 2.0
@Test (timeout = 5000)
public void testNewAPI_MR() throws Exception {
  String outputPath = TEST_DIR.getAbsolutePath();
  Job job = Job.getInstance();
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
  job.getConfiguration().setBoolean("mapred.reducer.new-api", true);
  // the output is attached to reducer
  job.getConfiguration().setBoolean(MRConfig.IS_MAP_PROCESSOR, false);
  UserPayload vertexPayload = TezUtils.createUserPayloadFromConf(job.getConfiguration());
  OutputDescriptor od = OutputDescriptor.create(MROutputLegacy.class.getName())
      .setUserPayload(vertexPayload);
  DataSinkDescriptor sink = DataSinkDescriptor.create(od,
      OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null);

  OutputContext outputContext = createMockOutputContext(sink.getOutputDescriptor().getUserPayload());
  MROutputLegacy output = new MROutputLegacy(outputContext, 2);
  output.initialize();
  assertEquals(true, output.useNewApi);
  assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
  assertNull(output.oldOutputFormat);
  assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
  assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
  assertNull(output.oldApiTaskAttemptContext);
  assertNotNull(output.newRecordWriter);
  assertNull(output.oldRecordWriter);
  assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
 
Example #29
Source File: FactDistinctColumnsJob.java    From kylin with Apache License 2.0
private void setupReducer(Path output, CubeSegment cubeSeg)
        throws IOException {
    FactDistinctColumnsReducerMapping reducerMapping = new FactDistinctColumnsReducerMapping(cubeSeg.getCubeInstance());
    int numberOfReducers = reducerMapping.getTotalReducerNum();
    logger.info("{} has reducers {}.", this.getClass().getName(), numberOfReducers);
    if (numberOfReducers > 250) {
        throw new IllegalArgumentException(
                "The max reducer number for FactDistinctColumnsJob is 250, but now it is "
                        + numberOfReducers
                        + ", decrease 'kylin.engine.mr.uhc-reducer-count'");
    }

    job.setReducerClass(FactDistinctColumnsReducer.class);
    job.setPartitionerClass(FactDistinctColumnPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    // make each reducer write output to its respective dir
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_COLUMN, SequenceFileOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class, NullWritable.class, ArrayPrimitiveWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_STATISTICS, SequenceFileOutputFormat.class, LongWritable.class, BytesWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_PARTITION, TextOutputFormat.class, NullWritable.class, LongWritable.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
 
Example #30
Source File: SparkStorageUtils.java    From DataVec with Apache License 2.0
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd,
                 Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId(); //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class,
                    SequenceFileOutputFormat.class);
}