org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat Java Examples

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat. Each example is taken from an open-source project; the source file, project, and license are noted above the code.
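Before the project examples, here is a minimal, self-contained sketch of the typical wiring: a map-only job that reads Text/IntWritable pairs from a SequenceFile and dumps them as text. The class name, key/value types, and paths are illustrative assumptions, not taken from any of the projects below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SequenceFileDump {

    // Identity mapper: the input key/value types must match what was written into the SequenceFile.
    public static class DumpMapper extends Mapper<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void map(Text key, IntWritable value, Context context)
                throws java.io.IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "sequence-file-dump");
        job.setJarByClass(SequenceFileDump.class);
        job.setInputFormatClass(SequenceFileInputFormat.class); // read key/value pairs from SequenceFiles
        job.setMapperClass(DumpMapper.class);
        job.setNumReduceTasks(0);                               // map-only: mapper output goes straight to the output format
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // existing SequenceFile(s)
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // must not exist yet
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

SequenceFileInputFormat determines the key and value classes from each file's header at read time, which is why the examples below only set output and map-output classes rather than input classes.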
Example #1
Source File: FlinkUtil.java    From kylin with Apache License 2.0
public static DataSet<String[]> readHiveRecords(boolean isSequenceFile, ExecutionEnvironment env, String inputPath, String hiveTable, Job job) throws IOException {
    DataSet<String[]> recordDataSet;

    if (isSequenceFile) {
        recordDataSet = env
                .createInput(HadoopInputs.readHadoopFile(new SequenceFileInputFormat(), BytesWritable.class, Text.class, inputPath, job),
                        TypeInformation.of(new TypeHint<Tuple2<BytesWritable, Text>>() {}))
                .map(new MapFunction<Tuple2<BytesWritable, Text>, String[]>() {
                    @Override
                    public String[] map(Tuple2<BytesWritable, Text> tuple2) throws Exception {

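                        // Each value holds one delimited flat-table row; split it into columns.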
                        String s = Bytes.toString(tuple2.f1.getBytes(), 0, tuple2.f1.getLength());
                        return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER);
                    }
                });
    } else {
        throw new UnsupportedOperationException("Currently, Flink does not support reading Hive tables directly.");
    }

    return recordDataSet;
}
 
Example #2
Source File: PageRankDriver.java    From flink-perf with Apache License 2.0
public static void printFinalRanks (Configuration conf, FileSystem fs, String inputPath, String outputPath) throws Exception {
	Path outFile = new Path (outputPath);
	if (fs.exists(outFile)) {
		fs.delete(outFile, true);
	}
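	// Map-only job: RankPrinter writes each final rank as text, so no reduce phase is needed.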
	Job job = Job.getInstance(conf);
	job.setMapperClass(RankPrinter.class);
	job.setReducerClass(Reducer.class);
	job.setNumReduceTasks(0);
	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(Text.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(Text.class);
	job.setInputFormatClass(SequenceFileInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);
	job.setJarByClass(RankPrinter.class);
	FileInputFormat.addInputPath(job, new Path(inputPath));
	FileOutputFormat.setOutputPath(job, outFile);
	job.waitForCompletion(true);
}
 
Example #3
Source File: FlinkUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
public static DataSet parseInputPath(String inputPath, FileSystem fs, ExecutionEnvironment env, Class keyClass,
        Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
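    // Collect first-level sub-directories, skipping bookkeeping entries whose names start with "_".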
    for (FileStatus stat : fileStatuses) {
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }

    if (!hasDir) {
        return env.createInput(HadoopInputs.readSequenceFile(keyClass, valueClass, inputHDFSPath.toString()));
    }

    Job job = Job.getInstance();
    FileInputFormat.setInputPaths(job, StringUtil.join(inputFolders, ","));
    return env.createInput(HadoopInputs.createHadoopInput(new SequenceFileInputFormat(), keyClass, valueClass, job));
}
 
Example #4
Source File: FlinkUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
public static DataSet<String[]> readHiveRecords(boolean isSequenceFile, ExecutionEnvironment env, String inputPath, String hiveTable, Job job) throws IOException {
    DataSet<String[]> recordDataSet;

    if (isSequenceFile) {
        recordDataSet = env
                .createInput(HadoopInputs.readHadoopFile(new SequenceFileInputFormat(), BytesWritable.class, Text.class, inputPath, job),
                        TypeInformation.of(new TypeHint<Tuple2<BytesWritable, Text>>() {}))
                .map(new MapFunction<Tuple2<BytesWritable, Text>, String[]>() {
                    @Override
                    public String[] map(Tuple2<BytesWritable, Text> tuple2) throws Exception {

                        String s = Bytes.toString(tuple2.f1.getBytes(), 0, tuple2.f1.getLength());
                        return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER);
                    }
                });
    } else {
        throw new UnsupportedOperationException("Currently, Flink does not support reading Hive tables directly.");
    }

    return recordDataSet;
}
 
Example #5
Source File: TestJoinDatamerge.java    From hadoop with Apache License 2.0
private static void joinAs(String jointype, 
    Class<? extends SimpleCheckerMapBase<?>> map, 
    Class<? extends SimpleCheckerReduceBase> reduce) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  Path base = cluster.getFileSystem().makeQualified(new Path("/"+jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
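  // Compose a CompositeInputFormat join expression (e.g. "inner"/"outer") over the generated sequence-file sources.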
  conf.set(CompositeInputFormat.JOIN_EXPR, CompositeInputFormat.compose(jointype,
      SequenceFileInputFormat.class, src));
  conf.setInt("testdatamerge.sources", srcs);
  Job job = Job.getInstance(conf);
  job.setInputFormatClass(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));

  job.setMapperClass(map);
  job.setReducerClass(reduce);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  job.waitForCompletion(true);
  assertTrue("Job failed", job.isSuccessful());
  if ("outer".equals(jointype)) {
    checkOuterConsistency(job, src);
  }
  base.getFileSystem(conf).delete(base, true);
}
 
Example #6
Source File: TestJoinDatamerge.java    From big-c with Apache License 2.0
private static void joinAs(String jointype, 
    Class<? extends SimpleCheckerMapBase<?>> map, 
    Class<? extends SimpleCheckerReduceBase> reduce) throws Exception {
  final int srcs = 4;
  Configuration conf = new Configuration();
  Path base = cluster.getFileSystem().makeQualified(new Path("/"+jointype));
  Path[] src = writeSimpleSrc(base, conf, srcs);
  conf.set(CompositeInputFormat.JOIN_EXPR, CompositeInputFormat.compose(jointype,
      SequenceFileInputFormat.class, src));
  conf.setInt("testdatamerge.sources", srcs);
  Job job = Job.getInstance(conf);
  job.setInputFormatClass(CompositeInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(base, "out"));

  job.setMapperClass(map);
  job.setReducerClass(reduce);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(IntWritable.class);
  job.waitForCompletion(true);
  assertTrue("Job failed", job.isSuccessful());
  if ("outer".equals(jointype)) {
    checkOuterConsistency(job, src);
  }
  base.getFileSystem(conf).delete(base, true);
}
 
Example #7
Source File: FlinkUtil.java    From kylin with Apache License 2.0
public static DataSet parseInputPath(String inputPath, FileSystem fs, ExecutionEnvironment env, Class keyClass,
        Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    for (FileStatus stat : fileStatuses) {
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }

    if (!hasDir) {
        return env.createInput(HadoopInputs.readSequenceFile(keyClass, valueClass, inputHDFSPath.toString()));
    }

    Job job = Job.getInstance();
    FileInputFormat.setInputPaths(job, StringUtil.join(inputFolders, ","));
    return env.createInput(HadoopInputs.createHadoopInput(new SequenceFileInputFormat(), keyClass, valueClass, job));
}
 
Example #8
Source File: Step32.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  
    Configuration conf1 = new Configuration();  

    Job job1 = new Job(conf1, "wiki  job one");  
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);  
    job1.setInputFormatClass(SequenceFileInputFormat.class);  
    job1.setNumReduceTasks(1);  
    job1.setJarByClass(Step32.class);  
    job1.setMapperClass(WikiMapper32.class);  
    job1.setMapOutputKeyClass(IntWritable.class);  
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);  
  
    job1.setReducerClass(WiKiReducer32.class);  
    job1.setOutputKeyClass(IntWritable.class);  
    job1.setOutputValueClass(VectorOrPrefWritable.class);  
      
    // the WiKiDriver's output is this one's input
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));  
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));     
    if(!job1.waitForCompletion(true)){  
        System.exit(1); // exit on job failure
    }  
}
 
Example #9
Source File: PageRankDriver.java    From flink-perf with Apache License 2.0
public static void calculateNextRanks (Configuration conf, FileSystem fs, String inputPath, String outputPath) throws Exception {
	Path outFile = new Path (outputPath);
	if (fs.exists(outFile)) {
		fs.delete(outFile, true);
	}
	Job job = Job.getInstance(conf);
	job.setJarByClass(PageRankMapper.class);
	job.setMapperClass(PageRankMapper.class);
	job.setReducerClass(PageRankReducer.class);
	job.setMapOutputKeyClass(LongWritable.class);
	job.setMapOutputValueClass(Message.class);
	job.setOutputKeyClass(LongWritable.class);
	job.setOutputValueClass(Message.class);
	job.setInputFormatClass(SequenceFileInputFormat.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);
	FileInputFormat.addInputPath(job, new Path(inputPath));
	FileOutputFormat.setOutputPath(job, outFile);
	job.waitForCompletion(true);
}
 
Example #10
Source File: JoinSelectStatsUtil.java    From rya with Apache License 2.0
public static void initSumMRJob(Job job, String inputPath, String outtable, String auths) throws AccumuloSecurityException, IOException {
  Configuration conf = job.getConfiguration();
  String username = conf.get(USERNAME);
  String password = conf.get(PASSWORD);
  String instance = conf.get(INSTANCE);
  String zookeepers = conf.get(ZOOKEEPERS);

  if (zookeepers != null) {
    AccumuloOutputFormat.setConnectorInfo(job, username, new PasswordToken(password));
    AccumuloOutputFormat.setZooKeeperInstance(job, instance, zookeepers);
  } else {
    throw new IllegalArgumentException("Must specify zookeepers");
  }

  SequenceFileInputFormat.addInputPath(job, new Path(inputPath));
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapOutputKeyClass(TripleEntry.class);
  job.setMapOutputValueClass(CardList.class);

  AccumuloOutputFormat.setDefaultTableName(job, outtable);
  job.setOutputFormatClass(AccumuloOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Mutation.class);

}
 
Example #11
Source File: TestInputFormatImportCommandCluster.java    From kite with Apache License 2.0
@Test
public void testMRImportWithTransform() throws Exception {
  Path sample = new Path(temp.newFile("sample.sequence").toString())
      .makeQualified(getDFS().getUri(), new Path("/"));
  writeSequenceFile(getDFS(), sample); // HDFS sequence file

  // Reusing records is okay when running in MR
  command.inFormatClass = SequenceFileInputFormat.class.getName();
  command.targets = Lists.newArrayList(sample.toString(), datasetUri);
  command.noCompaction = true; // no need to run reducers
  command.transform = TransformMeasurement.class.getName();

  int rc = command.run();

  Assert.assertEquals("Should return success", 0, rc);

  verify(console).info("Added {} records to \"{}\"", 3L, datasetUri);
  verifyNoMoreInteractions(console);

  Set<Measurement> datasetContent = materialize(
      Datasets.load(datasetUri, Measurement.class));
  Set<Measurement> expected = Sets.newHashSet(Iterables.transform(
      measurements, new TransformMeasurement()));
  Assert.assertEquals(expected, datasetContent);
}
 
Example #12
Source File: KMeansDriver.java    From flink-perf with Apache License 2.0
public static void convertPointsSequenceFileToText (Configuration conf, FileSystem fs, String seqFilePath, String outputPath) throws Exception {

	Path seqFile = new Path (seqFilePath);
	Path output = new Path (outputPath);
	if (fs.exists(output)) {
		fs.delete(output, true);
	}
	Job job = Job.getInstance(conf);
	job.setMapperClass(PointSequenceToTextConverter.class);
	job.setReducerClass(Reducer.class);
	job.setNumReduceTasks(0);
	job.setMapOutputKeyClass(LongWritable.class);
	job.setMapOutputValueClass(Text.class);
	job.setOutputKeyClass(LongWritable.class);
	job.setOutputValueClass(Text.class);
	job.setOutputFormatClass(TextOutputFormat.class);
	job.setInputFormatClass(SequenceFileInputFormat.class);
	FileInputFormat.addInputPath(job, seqFile);
	FileOutputFormat.setOutputPath(job, output);
	job.waitForCompletion(true);
}
 
Example #13
Source File: TestInputFormatImportCommandCluster.java    From kite with Apache License 2.0
@Test
public void testMRImport() throws Exception {
  Path sample = new Path(temp.newFile("sample.sequence").toString())
      .makeQualified(getDFS().getUri(), new Path("/"));
  writeSequenceFile(getDFS(), sample); // HDFS sequence file

  // Reusing records is okay when running in MR
  command.inFormatClass = SequenceFileInputFormat.class.getName();
  command.targets = Lists.newArrayList(sample.toString(), datasetUri);
  command.noCompaction = true; // no need to run reducers

  int rc = command.run();

  Assert.assertEquals("Should return success", 0, rc);

  verify(console).info("Added {} records to \"{}\"", 3L, datasetUri);
  verifyNoMoreInteractions(console);

  Set<Measurement> datasetContent = materialize(
      Datasets.load(datasetUri, Measurement.class));
  Assert.assertEquals(Sets.newHashSet(measurements), datasetContent);
}
 
Example #14
Source File: Step5.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  
    Configuration conf1 = new Configuration();  

    Job job1 = new Job(conf1, "wiki  job five");  
    job1.setNumReduceTasks(1);  
    job1.setJarByClass(Step5.class);  
    job1.setInputFormatClass(SequenceFileInputFormat.class);  
    job1.setMapperClass(WikiMapper5.class);  
    job1.setMapOutputKeyClass(VarLongWritable.class);  
    job1.setMapOutputValueClass(VectorWritable.class);  
      
    job1.setCombinerClass(WiKiCombiner5.class);  
    job1.setReducerClass(WiKiReducer5.class);  
    job1.setOutputKeyClass(VarLongWritable.class);  
    job1.setOutputValueClass(RecommendedItemsWritable.class);  
//   job1.setOutputFormatClass(SequenceFileOutputFormat.class);  
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));  
  
    FileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));     
    if(!job1.waitForCompletion(true)){  
        System.exit(1); // exit on job failure
    }  
}
 
Example #15
Source File: Step31.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  
    Configuration conf1 = new Configuration();  

    Job job1 = new Job(conf1, "wiki  job three1");  
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);  
    job1.setInputFormatClass(SequenceFileInputFormat.class);  
    job1.setNumReduceTasks(1);  
    job1.setJarByClass(Step31.class);  
    job1.setMapperClass(WikiMapper31.class);  
    job1.setMapOutputKeyClass(IntWritable.class);  
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);  
      
    // a reducer is set only so the output can be written with SequenceFileOutputFormat
    job1.setReducerClass(WiKiReducer31.class);  
    job1.setOutputKeyClass(IntWritable.class);  
    job1.setOutputValueClass(VectorOrPrefWritable.class);  
      
    // this job's input is MR2's output
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));  
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));     
    if(!job1.waitForCompletion(true)){  
        System.exit(1); // exit on job failure
    }  
}
 
Example #16
Source File: Step2.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    Configuration conf1 = new Configuration();

    Job job1 = new Job(conf1, "wiki  job two");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step2.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper2.class);
    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(IntWritable.class);
    job1.setReducerClass(WiKiReducer2.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(VectorWritable.class);
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1); // exit on job failure
    }
}
 
Example #17
Source File: Step4.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  
    Configuration conf1 = new Configuration();  
    String[] otherArgs = new GenericOptionsParser(conf1, args).getRemainingArgs();        
    Job job1 = new Job(conf1, "wiki  job four");  
    job1.setNumReduceTasks(1);  
    job1.setJarByClass(Step4.class);  
    job1.setInputFormatClass(SequenceFileInputFormat.class);  
    job1.setMapperClass(WikiMapper4.class);  
    job1.setMapOutputKeyClass(IntWritable.class);  
    job1.setMapOutputValueClass(VectorOrPrefWritable.class);      
    job1.setReducerClass(WiKiReducer4.class);  
    job1.setOutputKeyClass(IntWritable.class);  
    job1.setOutputValueClass(VectorAndPrefsWritable.class);  
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);  
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT1_PATH));  
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT2_PATH));  
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));     
    if(!job1.waitForCompletion(true)){  
        System.exit(1); // exit on job failure
    }  
}
 
Example #18
Source File: SequenceFileStockMapReduce.java    From hiped2 with Apache License 2.0
/**
 * Runs the MapReduce job that reads and writes the stock data as sequence files.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(SequenceFileStockMapReduce.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(StockPriceWritable.class);
  job.setInputFormatClass(
      SequenceFileInputFormat.class); //<co id="ch03_comment_seqfile_mr1"/>
  job.setOutputFormatClass(SequenceFileOutputFormat.class);  //<co id="ch03_comment_seqfile_mr2"/>
  SequenceFileOutputFormat.setCompressOutput(job, true);  //<co id="ch03_comment_seqfile_mr3"/>
  SequenceFileOutputFormat.setOutputCompressionType(job,  //<co id="ch03_comment_seqfile_mr4"/>
      SequenceFile.CompressionType.BLOCK);
  SequenceFileOutputFormat.setOutputCompressorClass(job,  //<co id="ch03_comment_seqfile_mr5"/>
      DefaultCodec.class);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  if (job.waitForCompletion(true)) {
    return 0;
  }
  return 1;
}
 
Example #19
Source File: IndexerJobDriver.java    From incubator-retired-blur with Apache License 2.0
private boolean runMrOnly(TableDescriptor descriptor, List<Path> inprogressPathList, String table, Path fileCache,
    Path outputPath, int reducerMultipler) throws IOException, ClassNotFoundException, InterruptedException {
  Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");
  Path tablePath = new Path(descriptor.getTableUri());
  BlurInputFormat.setLocalCachePath(job, fileCache);
  BlurInputFormat.addTable(job, descriptor, MRUPDATE_SNAPSHOT);
  MultipleInputs.addInputPath(job, tablePath, BlurInputFormat.class, ExistingDataMapper.class);

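  // Each in-progress path holds new row data stored as sequence files; NewDataMapper consumes those records.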
  for (Path p : inprogressPathList) {
    FileInputFormat.addInputPath(job, p);
    MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, NewDataMapper.class);
  }

  BlurOutputFormat.setOutputPath(job, outputPath);
  BlurOutputFormat.setupJob(job, descriptor);

  job.setReducerClass(UpdateReducer.class);
  job.setMapOutputKeyClass(IndexKey.class);
  job.setMapOutputValueClass(IndexValue.class);
  job.setPartitionerClass(IndexKeyPartitioner.class);
  job.setGroupingComparatorClass(IndexKeyWritableComparator.class);

  BlurOutputFormat.setReducerMultiplier(job, reducerMultipler);

  boolean success = job.waitForCompletion(true);
  Counters counters = job.getCounters();
  LOG.info("Counters [" + counters + "]");
  return success;
}
 
Example #20
Source File: IndexerJobDriver.java    From incubator-retired-blur with Apache License 2.0
private boolean runMrWithLookup(String uuid, TableDescriptor descriptor, List<Path> inprogressPathList, String table,
    Path fileCache, Path outputPath, int reducerMultipler, Path tmpPath, TableStats tableStats, String snapshot)
    throws ClassNotFoundException, IOException, InterruptedException {
  PartitionedInputResult result = buildPartitionedInputData(uuid, tmpPath, descriptor, inprogressPathList, snapshot,
      fileCache);

  Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");

  ExistingDataIndexLookupMapper.setSnapshot(job, MRUPDATE_SNAPSHOT);
  FileInputFormat.addInputPath(job, result._partitionedInputData);
  MultipleInputs.addInputPath(job, result._partitionedInputData, SequenceFileInputFormat.class,
      ExistingDataIndexLookupMapper.class);

  for (Path p : inprogressPathList) {
    FileInputFormat.addInputPath(job, p);
    MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, NewDataMapper.class);
  }

  BlurOutputFormat.setOutputPath(job, outputPath);
  BlurOutputFormat.setupJob(job, descriptor);

  job.setReducerClass(UpdateReducer.class);
  job.setMapOutputKeyClass(IndexKey.class);
  job.setMapOutputValueClass(IndexValue.class);
  job.setPartitionerClass(IndexKeyPartitioner.class);
  job.setGroupingComparatorClass(IndexKeyWritableComparator.class);

  BlurOutputFormat.setReducerMultiplier(job, reducerMultipler);

  boolean success = job.waitForCompletion(true);
  Counters counters = job.getCounters();
  LOG.info("Counters [" + counters + "]");
  return success;
}
 
Example #21
Source File: IndexerJobDriver.java    From incubator-retired-blur with Apache License 2.0
private PartitionedInputResult buildPartitionedInputData(String uuid, Path tmpPath, TableDescriptor descriptor,
    List<Path> inprogressPathList, String snapshot, Path fileCachePath) throws IOException, ClassNotFoundException,
    InterruptedException {
  Job job = Job.getInstance(getConf(), "Partitioning data for table [" + descriptor.getName() + "]");
  job.getConfiguration().set(BLUR_UPDATE_ID, uuid);

  // Needed for the bloom filter path information.
  BlurOutputFormat.setTableDescriptor(job, descriptor);
  BlurInputFormat.setLocalCachePath(job, fileCachePath);
  ExistingDataIndexLookupMapper.setSnapshot(job, snapshot);

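  // The in-progress update files are read as sequence-file input; one reducer per shard builds the partitioned lookup data.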
  for (Path p : inprogressPathList) {
    FileInputFormat.addInputPath(job, p);
  }
  Path outputPath = new Path(tmpPath, UUID.randomUUID().toString());
  job.setJarByClass(getClass());
  job.setMapperClass(LookupBuilderMapper.class);
  job.setReducerClass(LookupBuilderReducer.class);

  int shardCount = descriptor.getShardCount();
  job.setNumReduceTasks(shardCount);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(BooleanWritable.class);
  FileOutputFormat.setOutputPath(job, outputPath);
  if (job.waitForCompletion(true)) {
    return new PartitionedInputResult(outputPath, job.getCounters(), shardCount, job.getTaskReports(TaskType.REDUCE));
  } else {
    throw new IOException("Partitioning failed!");
  }
}
 
Example #22
Source File: SampleUploader.java    From hbase with Apache License 2.0
/**
 * Job configuration.
 */
public static Job configureJob(Configuration conf, String [] args) throws IOException {
  Path inputPath = new Path(args[0]);
  String tableName = args[1];
  Job job = new Job(conf, NAME + "_" + tableName);
  job.setJarByClass(Uploader.class);
  FileInputFormat.setInputPaths(job, inputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapperClass(Uploader.class);
  // No reducers.  Just write straight to table.  Call initTableReducerJob
  // because it sets up the TableOutputFormat.
  TableMapReduceUtil.initTableReducerJob(tableName, null, job);
  job.setNumReduceTasks(0);
  return job;
}
 
Example #23
Source File: JoinSelectStatisticsTest.java    From rya with Apache License 2.0
@Override
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    String outpath = conf.get(OUTPUTPATH);
    
    Job job = new Job(conf, this.getClass().getSimpleName() + "_" + System.currentTimeMillis());
    job.setJarByClass(this.getClass());
    conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true);
    
    MultipleInputs.addInputPath(job, new Path(PROSPECTSOUT.getAbsolutePath()), 
            SequenceFileInputFormat.class, JoinSelectAggregateMapper.class);
    MultipleInputs.addInputPath(job,new Path(SPOOUT.getAbsolutePath()) , 
            SequenceFileInputFormat.class, JoinSelectAggregateMapper.class);
    job.setMapOutputKeyClass(CompositeType.class);
    job.setMapOutputValueClass(TripleCard.class);

    tempDir = new File(File.createTempFile(outpath, "txt").getParentFile(), System.currentTimeMillis() + "");
    SequenceFileOutputFormat.setOutputPath(job, new Path(tempDir.getAbsolutePath()));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(TripleEntry.class);
    job.setOutputValueClass(CardList.class);


    job.setSortComparatorClass(JoinSelectSortComparator.class);
    job.setGroupingComparatorClass(JoinSelectGroupComparator.class);
    job.setPartitionerClass(JoinSelectPartitioner.class);
    job.setReducerClass(JoinReducer.class);
    job.setNumReduceTasks(32);
    job.waitForCompletion(true);
    
    return job.isSuccessful() ? 0 : 1;          
}
 
Example #24
Source File: JoinSelectStatsUtil.java    From rya with Apache License 2.0
public static void initJoinMRJob(Job job, String prospectsPath, String spoPath, Class<? extends Mapper<CompositeType,TripleCard,?,?>> mapperClass,
    String outPath, String auths) throws AccumuloSecurityException {

  MultipleInputs.addInputPath(job, new Path(prospectsPath), SequenceFileInputFormat.class, mapperClass);
  MultipleInputs.addInputPath(job, new Path(spoPath), SequenceFileInputFormat.class, mapperClass);
  job.setMapOutputKeyClass(CompositeType.class);
  job.setMapOutputValueClass(TripleCard.class);

  SequenceFileOutputFormat.setOutputPath(job, new Path(outPath));
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(TripleEntry.class);
  job.setOutputValueClass(CardList.class);

}
 
Example #25
Source File: FileSystemStorageCheck.java    From tinkerpop with Apache License 2.0
@Test
@LoadGraphWith(LoadGraphWith.GraphData.MODERN)
public void shouldSupportCopyMethods() throws Exception {
    // Make sure Spark is shut down before deleting its files and directories,
    // which are locked under Windows and fail the tests. See FileSystemStorageCheck
    graph.configuration().setProperty(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false);

    final Storage storage = FileSystemStorage.open(ConfUtil.makeHadoopConfiguration(graph.configuration()));
    final String outputLocation = graph.configuration().getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
    final String newOutputLocation = TestHelper.makeTestDataDirectory(FileSystemStorageCheck.class, "new-location-for-copy");
    // TestHelper creates the directory and we need it not to exist
    deleteDirectory(newOutputLocation);
    super.checkCopyMethods(storage, outputLocation, newOutputLocation, InputOutputHelper.getInputFormat((Class) Class.forName(graph.configuration().getString(Constants.GREMLIN_HADOOP_GRAPH_WRITER))), SequenceFileInputFormat.class);

}
 
Example #26
Source File: InputFormatRDD.java    From tinkerpop with Apache License 2.0
@Override
public <K, V> JavaPairRDD<K, V> readMemoryRDD(final Configuration configuration, final String memoryKey, final JavaSparkContext sparkContext) {
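    // Memory values are persisted as ObjectWritable key/value pairs; unwrap them back into their raw types.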
    return sparkContext.newAPIHadoopRDD(ConfUtil.makeHadoopConfiguration(configuration),
            SequenceFileInputFormat.class,
            ObjectWritable.class,
            ObjectWritable.class)
            .mapToPair(tuple -> new Tuple2<>((K) ((Tuple2<ObjectWritable, ObjectWritable>) tuple)._1().get(), (V) ((Tuple2<ObjectWritable, ObjectWritable>) tuple)._2().get()));
}
 
Example #27
Source File: FileSystemStorage.java    From tinkerpop with Apache License 2.0
@Override
public <K, V> Iterator<KeyValue<K, V>> head(final String location, final String memoryKey, final Class readerClass, final int totalLines) {
    if (!readerClass.equals(SequenceFileInputFormat.class))
        throw new IllegalArgumentException("Only " + SequenceFileInputFormat.class.getCanonicalName() + " memories are supported");
    final Configuration configuration = new Configuration();
    try {
        return IteratorUtils.limit((Iterator) new ObjectWritableIterator(configuration, new Path(Constants.getMemoryLocation(location, memoryKey))), totalLines);
    } catch (final IOException e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}
 
Example #28
Source File: SparkStorageUtils.java    From deeplearning4j with Apache License 2.0
/**
 * Restore a {@code JavaPairRDD<Long,List<List<Writable>>>} previously saved with {@link #saveMapFileSequences(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFileSequences(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, with their unique indices as the key
 */
public static JavaPairRDD<Long, List<List<Writable>>> restoreMapFileSequences(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
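    // Point the Hadoop input format at the MapFile directory (MapFile data is readable as a SequenceFile).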
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));
    JavaPairRDD<LongWritable, SequenceRecordWritable> pairRDD = sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class,
                    LongWritable.class, SequenceRecordWritable.class);

    return pairRDD.mapToPair(new SequenceRecordLoadPairFunction());
}
 
Example #29
Source File: FileSystemStorageCheck.java    From tinkerpop with Apache License 2.0
@Test
@LoadGraphWith(LoadGraphWith.GraphData.MODERN)
public void shouldSupportHeadMethods() throws Exception {
    // Make sure Spark is shut down before deleting its files and directories,
    // which are locked under Windows and fail the tests. See FileSystemStorageCheck
    graph.configuration().setProperty(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false);

    final Storage storage = FileSystemStorage.open(ConfUtil.makeHadoopConfiguration(graph.configuration()));
    final String inputLocation = Constants.getSearchGraphLocation(graph.configuration().getString(Constants.GREMLIN_HADOOP_INPUT_LOCATION), storage).get();
    final String outputLocation = graph.configuration().getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
    // TestHelper creates the directory and we need it not to exist
    deleteDirectory(outputLocation);
    super.checkHeadMethods(storage, inputLocation, outputLocation, InputOutputHelper.getInputFormat((Class) Class.forName(graph.configuration().getString(Constants.GREMLIN_HADOOP_GRAPH_WRITER))), SequenceFileInputFormat.class);
}
 
Example #30
Source File: KafkaMRInput.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public void configureJob(Job job) {
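    // The intermediate flat table for this cube segment is stored as sequence files under the job working directory.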
    job.setInputFormatClass(SequenceFileInputFormat.class);
    String jobId = job.getConfiguration().get(BatchConstants.ARG_CUBING_JOB_ID);
    IJoinedFlatTableDesc flatHiveTableDesc = new CubeJoinedFlatTableDesc(cubeSegment);
    String inputPath = JoinedFlatTable.getTableDir(flatHiveTableDesc,
            JobBuilderSupport.getJobWorkingDir(conf, jobId));
    try {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}