Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileInputFormat. They are extracted from open source projects; where available, the source project, source file, and license are noted above each example.
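Before the examples, a minimal self-contained driver sketch (the /tmp/... paths, class name, and job name are placeholders, not taken from any project below) showing how FileInputFormat's path helpers interact: setInputPaths replaces whatever inputs are already configured, while addInputPath appends to them.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class FileInputFormatPathsSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "paths-sketch");

    // Replace the configured inputs with exactly these two paths.
    FileInputFormat.setInputPaths(job, new Path("/tmp/in1"), new Path("/tmp/in2"));

    // Append one more path to the existing list.
    FileInputFormat.addInputPath(job, new Path("/tmp/in3"));

    // Print what ended up in mapreduce.input.fileinputformat.inputdir.
    for (Path p : FileInputFormat.getInputPaths(job)) {
      System.out.println(p);
    }
  }
}

The drivers below use some combination of these calls before submitting the job.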
Example 1
Source Project: big-c   Source File: MapReduceTestUtil.java    License: Apache License 2.0
/**
 * Creates a simple kill job.
 * 
 * @param conf Configuration object.
 * @param outdir Output directory.
 * @param indirs Input directories.
 * @return Job initialized for a simple kill job.
 * @throws Exception If an error occurs creating the job configuration.
 */
public static Job createKillJob(Configuration conf, Path outdir, 
    Path... indirs) throws Exception {

  Job theJob = Job.getInstance(conf);
  theJob.setJobName("Kill-Job");

  FileInputFormat.setInputPaths(theJob, indirs);
  theJob.setMapperClass(KillMapper.class);
  theJob.setReducerClass(Reducer.class);
  theJob.setNumReduceTasks(0);
  FileOutputFormat.setOutputPath(theJob, outdir);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  return theJob;
}
 
Example 2
Source Project: RDFS   Source File: GridmixJob.java    License: Apache License 2.0
public Job call() throws IOException, InterruptedException,
                         ClassNotFoundException {
  job.setMapperClass(GridmixMapper.class);
  job.setReducerClass(GridmixReducer.class);
  job.setNumReduceTasks(jobdesc.getNumberReduces());
  job.setMapOutputKeyClass(GridmixKey.class);
  job.setMapOutputValueClass(GridmixRecord.class);
  job.setSortComparatorClass(GridmixKey.Comparator.class);
  job.setGroupingComparatorClass(SpecGroupingComparator.class);
  job.setInputFormatClass(GridmixInputFormat.class);
  job.setOutputFormatClass(RawBytesOutputFormat.class);
  job.setPartitionerClass(DraftPartitioner.class);
  job.setJarByClass(GridmixJob.class);
  job.getConfiguration().setInt("gridmix.job.seq", seq);
  job.getConfiguration().set(ORIGNAME, null == jobdesc.getJobID()
      ? "<unknown>" : jobdesc.getJobID().toString());
  job.getConfiguration().setBoolean("mapred.used.genericoptionsparser", true);
  FileInputFormat.addInputPath(job, new Path("ignored"));
  FileOutputFormat.setOutputPath(job, outdir);
  job.submit();
  return job;
}
 
Example 3
@Override
public int run(String[] arg0) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, arg0).getRemainingArgs();
	if (otherArgs.length != 2) {
		System.err.println("Usage: CommentWordCount <in> <out>");
		System.exit(2);
	}
	Job job = new Job(conf, "StackOverflow Comment Word Count");
	job.setJarByClass(CommentWordCount.class);
	job.setMapperClass(WordCountMapper.class);
	job.setCombinerClass(WordCountReducer.class);
	job.setReducerClass(WordCountReducer.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	boolean success = job.waitForCompletion(true);

	return success ? 0 : 1;
}
 
Example 4
Source Project: BigData-In-Practice   Source File: MaxTemperature.java    License: Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    if (args.length != 2) {
        System.err.println("Usage: MaxTemperature <input path> <output path>");
        System.exit(-1);
    }
    Job job = Job.getInstance();
    job.setJarByClass(MaxTemperature.class);
    job.setJobName("MapReduce实验-气象数据集-求气温最大值");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(TemperatureMapper.class);
    // Setting a Combiner reduces the amount of data transferred and improves efficiency
    // job.setCombinerClass(MaxTemperatureReducer.class);
    job.setReducerClass(MaxTemperatureReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 5
Source Project: hadoop-map-reduce-patterns   Source File: Average.java    License: Apache License 2.0
@Override
public int run(String[] arg0) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, arg0).getRemainingArgs();
	if (otherArgs.length != 2) {
		System.err.println("Usage: Average <in> <out>");
		System.exit(2);
	}
	Job job = new Job(conf, "StackOverflow Comment Average");
	job.setJarByClass(Average.class);
	job.setMapperClass(AverageMapper.class);
	job.setCombinerClass(AverageReducer.class);
	job.setReducerClass(AverageReducer.class);
	job.setOutputKeyClass(IntWritable.class);
	job.setOutputValueClass(CountAverageTuple.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	boolean success = job.waitForCompletion(true);

	return success ? 0 : 1;
}
 
Example 6
Source Project: MLHadoop   Source File: DT_ID3_Driver.java    License: Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException{
	Configuration conf=new Configuration();
	Job job = new Job(conf);
	job.setJarByClass(DT_ID3_Driver.class);
	job.setJobName("Decision_Tree_Algorithm_on_Hadoop");
	FileInputFormat.setInputPaths(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));
	//job.setNumReduceTasks(0);
	job.setMapperClass(DT_ID3_Map.class);
	job.setReducerClass(DT_ID3_Reduce.class);
	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(Text.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(Text.class);
	boolean success = job.waitForCompletion(true);
	System.exit(success ? 0 : 1);
}
 
Example 7
public static Schema getNewestSchemaFromSource(Job job, FileSystem fs) throws IOException {
  Path[] sourceDirs = FileInputFormat.getInputPaths(job);

  List<FileStatus> files = new ArrayList<FileStatus>();

  for (Path sourceDir : sourceDirs) {
    files.addAll(Arrays.asList(fs.listStatus(sourceDir)));
  }

  Collections.sort(files, new LastModifiedDescComparator());

  for (FileStatus file : files) {
    Schema schema = getNewestSchemaFromSource(file.getPath(), fs);
    if (schema != null) {
      return schema;
    }
  }
  return null;
}
 
Example 8
Source Project: RDFS   Source File: WordCount.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println("Usage: wordcount <in> <out>");
    System.exit(2);
  }
  Job job = new Job(conf, "word count");
  job.setJarByClass(WordCount.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
  FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
  long t1 = System.currentTimeMillis();
  boolean re = job.waitForCompletion(true);
  long t2 = System.currentTimeMillis();
  System.out.println((float) (t2 - t1) / 1000); // elapsed time in seconds
  if (re)
    System.exit(0);
  else
    System.exit(1);
}
 
Example 9
Source Project: hadoop   Source File: SleepJob.java    License: Apache License 2.0
public Job createJob(int numMapper, int numReducer, 
                     long mapSleepTime, int mapSleepCount, 
                     long reduceSleepTime, int reduceSleepCount) 
    throws IOException {
  Configuration conf = getConf();
  conf.setLong(MAP_SLEEP_TIME, mapSleepTime);
  conf.setLong(REDUCE_SLEEP_TIME, reduceSleepTime);
  conf.setInt(MAP_SLEEP_COUNT, mapSleepCount);
  conf.setInt(REDUCE_SLEEP_COUNT, reduceSleepCount);
  conf.setInt(MRJobConfig.NUM_MAPS, numMapper);
  Job job = Job.getInstance(conf, "sleep");
  job.setNumReduceTasks(numReducer);
  job.setJarByClass(SleepJob.class);
  job.setMapperClass(SleepMapper.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(SleepReducer.class);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setInputFormatClass(SleepInputFormat.class);
  job.setPartitionerClass(SleepJobPartitioner.class);
  job.setSpeculativeExecution(false);
  job.setJobName("Sleep job");
  FileInputFormat.addInputPath(job, new Path("ignored"));
  return job;
}
 
Example 10
Source Project: wifi   Source File: WordCount.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	if (otherArgs.length != 2) {
		System.out.println("Usage: wordcount <in> <out>");
		System.exit(2);
	}

	Job job = new Job(conf, "word count");
	job.setJarByClass(WordCount.class);
	job.setMapperClass(TokenizerMapper.class);
	job.setCombinerClass(IntSumReducer.class);
	job.setReducerClass(IntSumReducer.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 11
Source Project: vespa   Source File: MapReduceTest.java    License: Apache License 2.0
@Test
public void requireThatMapOnlyJobSucceeds() throws Exception {
    Job job = Job.getInstance(conf);
    job.setJarByClass(MapReduceTest.class);
    job.setMapperClass(FeedMapper.class);
    job.setOutputFormatClass(VespaOutputFormat.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, metricsJsonPath);

    boolean success = job.waitForCompletion(true);
    assertTrue("Job Failed", success);

    VespaCounters counters = VespaCounters.get(job);
    assertEquals(10, counters.getDocumentsSent());
    assertEquals(0, counters.getDocumentsFailed());
    assertEquals(10, counters.getDocumentsOk());
}
 
Example 12
Source Project: hiped2   Source File: Main.java    License: Apache License 2.0
public static boolean runCalcJob(Configuration conf, Path input, Path outputPath)
    throws Exception {

  Job job = new Job(conf);
  job.setJarByClass(Main.class);
  job.setMapperClass(CalcMapReduce.Map.class);
  job.setReducerClass(CalcMapReduce.Reduce.class);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  job.setMapOutputKeyClass(CalcMapReduce.TextPair.class);
  job.setMapOutputValueClass(IntWritable.class);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, outputPath);

  return job.waitForCompletion(true);
}
 
Example 13
Source Project: incubator-gobblin   Source File: HadoopFileInputSource.java    License: Apache License 2.0
@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
  if (!workUnitState.contains(FILE_SPLIT_BYTES_STRING_KEY)) {
    throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
  }

  Configuration configuration = new Configuration();
  FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, configuration);

  String fileSplitBytesStr = workUnitState.getProp(FILE_SPLIT_BYTES_STRING_KEY);
  FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);
  TaskAttemptContext taskAttemptContext =
      getTaskAttemptContext(configuration, DummyTaskAttemptIDFactory.newTaskAttemptID());
  try {
    RecordReader<K, V> recordReader = fileInputFormat.createRecordReader(fileSplit, taskAttemptContext);
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean readKeys = workUnitState.getPropAsBoolean(FILE_INPUT_READ_KEYS_KEY, DEFAULT_FILE_INPUT_READ_KEYS);
    return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
  } catch (InterruptedException ie) {
    throw new IOException(ie);
  }
}
 
Example 14
Source Project: recsys-offline   Source File: Step1.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
    Configuration conf1 = new Configuration();

    Job job1 = new Job(conf1, "step1");
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step1.class);
    job1.setMapperClass(WikiMapper1.class);
    job1.setMapOutputKeyClass(VarLongWritable.class);
    job1.setMapOutputValueClass(LongAndFloat.class);
    job1.setReducerClass(WiKiReducer1.class);
    job1.setOutputKeyClass(VarLongWritable.class);
    job1.setOutputValueClass(VectorWritable.class);

    FileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    SequenceFileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));
    if (!job1.waitForCompletion(true)) {
        System.exit(1);
    }
}
 
Example 15
Source Project: hadoop   Source File: TestFileInputFormat.java    License: Apache License 2.0
@Test
public void testListStatusNestedRecursive() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);

  List<Path> expectedPaths = org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .configureTestNestedRecursive(conf, localFs);
  JobConf jobConf = new JobConf(conf);
  TextInputFormat fif = new TextInputFormat();
  fif.configure(jobConf);
  FileStatus[] statuses = fif.listStatus(jobConf);

  org.apache.hadoop.mapreduce.lib.input.TestFileInputFormat
      .verifyFileStatuses(expectedPaths, Lists.newArrayList(statuses),
          localFs);
}
 
Example 16
@Override
public void configureInputFormat(Job job, String tableName,
    String tableClassName, String splitByCol)
    throws ClassNotFoundException, IOException {

  // Write a line of text into a file so that we can get
  // a record to the map task.
  Path dir = new Path(this.options.getTempDir());
  Path p = new Path(dir, "sqoop-dummy-import-job-file.txt");
  FileSystem fs = FileSystem.getLocal(this.options.getConf());
  if (fs.exists(p)) {
    boolean result = fs.delete(p, false);
    assertTrue("Couldn't delete temp file!", result);
  }

  BufferedWriter w = new BufferedWriter(
      new OutputStreamWriter(fs.create(p)));
  w.append("This is a line!");
  w.close();

  FileInputFormat.addInputPath(job, p);

  // And set the InputFormat itself.
  super.configureInputFormat(job, tableName, tableClassName, splitByCol);
}
 
Example 17
Source Project: Flink-CEPplus   Source File: HadoopInputFormatBase.java    License: Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
	// only gather base statistics for FileInputFormats
	if (!(mapreduceInputFormat instanceof FileInputFormat)) {
		return null;
	}

	JobContext jobContext = new JobContextImpl(configuration, null);

	final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ?
			(FileBaseStatistics) cachedStats : null;

	try {
		final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(jobContext);
		return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			LOG.warn("Could not determine statistics due to an io error: "
					+ ioex.getMessage());
		}
	} catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			LOG.error("Unexpected problem while getting the file statistics: "
					+ t.getMessage(), t);
		}
	}

	// no statistics available
	return null;
}
 
Example 18
Source Project: pentaho-hadoop-shims   Source File: ConfigurationProxyV2.java    License: Apache License 2.0
@Override
public void setInputPaths( org.pentaho.hadoop.shim.api.internal.fs.Path... paths ) {
  if ( paths == null ) {
    return;
  }
  Path[] actualPaths = new Path[ paths.length ];
  for ( int i = 0; i < paths.length; i++ ) {
    actualPaths[ i ] = ShimUtils.asPath( paths[ i ] );
  }
  try {
    FileInputFormat.setInputPaths( getJob(), actualPaths );
  } catch ( IOException e ) {
    e.printStackTrace();
  }
}
 
Example 19
Source Project: components   Source File: ConfigurableHDFSFileSource.java    License: Apache License 2.0
/**
 * Create a {@code HDFSFileReader} based on a single Hadoop input split.
 */
public HDFSFileReader(BoundedSource<KV<K, V>> source, String filepattern,
        Class<? extends FileInputFormat<?, ?>> formatClass, InputSplit split) throws IOException {
    this.source = source;
    this.filepattern = filepattern;
    this.formatClass = formatClass;
    if (split != null) {
        this.splits = ImmutableList.of(split);
        this.splitsIterator = splits.listIterator();
    }
    this.job = ((ConfigurableHDFSFileSource<K, V>) getCurrentSource()).jobInstance(); // new instance
}
 
Example 20
public static ChangedFile[] getSnapshot(String dataPath, String repoName, int index) throws Exception {
	TestBuildSnapshotFromSequenceFile.repoName = repoName;
	TestBuildSnapshotFromSequenceFile.index = index;
	TestBuildSnapshotFromSequenceFile.snapshot = null;
	
	File outDir = new File("dataset/temp_output");
	if (outDir.exists())
		new FileIO.DirectoryRemover(outDir.getAbsolutePath()).run();
	
	Configuration conf = new Configuration();
	Job job = new Job(conf, "read sequence file");
	job.setJarByClass(TestBuildSnapshotFromSequenceFile.class);
	job.setMapperClass(SequenceFileReaderMapper.class);
	job.setCombinerClass(SequenceFileReaderReducer.class);
	job.setReducerClass(SequenceFileReaderReducer.class);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class);
	FileInputFormat.addInputPath(job, new Path(dataPath, "projects.seq"));
	FileOutputFormat.setOutputPath(job, new Path(outDir.getAbsolutePath()));
	boolean completed = job.waitForCompletion(false);
	assertEquals(true, completed);

	if (outDir.exists())
		new FileIO.DirectoryRemover(outDir.getAbsolutePath()).run();
	
	return snapshot;
}
 
Example 21
Source Project: kylin   Source File: BuildGlobalHiveDictPartBuildJob.java    License: Apache License 2.0
private void setInput(Job job, String[] dicColsArray, String inputBase) throws IOException {
    StringBuffer paths = new StringBuffer();
    // make each reducer output to respective dir
    for (String col : dicColsArray) {
        paths.append(inputBase).append("/dict_column=").append(col).append(",");
    }
    paths.delete(paths.length() - 1, paths.length());
    FileInputFormat.setInputPaths(job, paths.toString());
}
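Note: the comma-joined string above works because FileInputFormat.setInputPaths(Job, String) interprets its argument as a comma-separated list of paths. A minimal sketch, assuming placeholder column names and a placeholder base directory:

import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class CommaSeparatedPathsSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "comma-paths-sketch");
    // Two hypothetical per-column directories, joined with a comma as in setInput() above.
    FileInputFormat.setInputPaths(job, "/base/dict_column=colA,/base/dict_column=colB");
    // getInputPaths confirms the two directories were registered as separate inputs.
    System.out.println(Arrays.toString(FileInputFormat.getInputPaths(job)));
  }
}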
 
Example 22
Source Project: Hadoop-BAM   Source File: TestBAMOutputFormat.java    License: MIT License
@Test
public void testBAMWithSplittingBai() throws Exception {
    int numPairs = 20000;
    // create a large BAM with lots of index points
    String bam = BAMTestUtil.writeBamFile(numPairs,
        SAMFileHeader.SortOrder.coordinate).toURI().toString();
    conf.setInt(FileInputFormat.SPLIT_MAXSIZE, 800000); // force multiple parts
    conf.setBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, true);
    final Path outputPath = doMapReduce(bam);

    List<SAMRecord> recordsAtSplits = new ArrayList<>();
    File[] splittingIndexes = new File(outputPath.toUri()).listFiles(pathname -> {
        return pathname.getName().endsWith(SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
    });
    Arrays.sort(splittingIndexes); // ensure files are sorted by name
    for (File file : splittingIndexes) {
        File bamFile = new File(file.getParentFile(),
            file.getName().replace(SplittingBAMIndexer.OUTPUT_FILE_EXTENSION, ""));
        SplittingBAMIndex index = new SplittingBAMIndex(file);
        recordsAtSplits.addAll(getRecordsAtSplits(bamFile, index));
    }

    final File outFile = File.createTempFile("testBAMWriter", ".bam");
    //outFile.deleteOnExit();
    SAMFileMerger.mergeParts(outputPath.toUri().toString(), outFile.toURI().toString(),
        SAMFormat.BAM,
        new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate).getHeader());

    final int actualCount = getBAMRecordCount(outFile);
    assertEquals(numPairs * 2 + 2, actualCount); // 2 unmapped reads

    File splittingBai = new File(outFile.getParentFile(), outFile.getName() +
        SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
    SplittingBAMIndex splittingBAMIndex = new SplittingBAMIndex(splittingBai);

    assertEquals(recordsAtSplits, getRecordsAtSplits(outFile, splittingBAMIndex));
}
 
Example 23
Source Project: hadoop   Source File: GenerateData.java    License: Apache License 2.0
@Override
public Job call() throws IOException, InterruptedException,
                         ClassNotFoundException {
  UserGroupInformation ugi = UserGroupInformation.getLoginUser();
  ugi.doAs(new PrivilegedExceptionAction<Job>() {
    public Job run() throws IOException, ClassNotFoundException,
                            InterruptedException {
      // check if compression emulation is enabled
      if (CompressionEmulationUtil
          .isCompressionEmulationEnabled(job.getConfiguration())) {
        CompressionEmulationUtil.configure(job);
      } else {
        configureRandomBytesDataGenerator();
      }
      job.submit();
      return job;
    }

    private void configureRandomBytesDataGenerator() {
      job.setMapperClass(GenDataMapper.class);
      job.setNumReduceTasks(0);
      job.setMapOutputKeyClass(NullWritable.class);
      job.setMapOutputValueClass(BytesWritable.class);
      job.setInputFormatClass(GenDataFormat.class);
      job.setOutputFormatClass(RawBytesOutputFormat.class);
      job.setJarByClass(GenerateData.class);
      try {
        FileInputFormat.addInputPath(job, new Path("ignored"));
      } catch (IOException e) {
        LOG.error("Error while adding input path ", e);
      }
    }
  });
  return job;
}
 
Example 24
Source Project: big-c   Source File: WordStandardDeviation.java    License: Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: wordstddev <in> <out>");
    return 0;
  }

  Configuration conf = getConf();

  Job job = Job.getInstance(conf, "word stddev");
  job.setJarByClass(WordStandardDeviation.class);
  job.setMapperClass(WordStandardDeviationMapper.class);
  job.setCombinerClass(WordStandardDeviationReducer.class);
  job.setReducerClass(WordStandardDeviationReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  Path outputpath = new Path(args[1]);
  FileOutputFormat.setOutputPath(job, outputpath);
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);

  boolean completed = job.waitForCompletion(true);

  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " +
      (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

  // read output and calculate standard deviation
  stddev = readAndCalcStdDev(outputpath, conf);

  return (completed ? 0 : 1);
}
 
Example 25
Source Project: dkpro-c4corpus   Source File: Phase3Step2DistinctDataJob.java    License: Apache License 2.0
@Override
public int run(String[] args)
        throws Exception
{

    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step2DistinctDataJob.class);
    job.setJobName(Phase3Step2DistinctDataJob.class.getName());

    //mapper
    job.setMapperClass(RemoveRedundantDataMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);

    //reducer
    job.setReducerClass(RemoveRedundantDataReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    //paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    //i/o paths
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 26
Source Project: flink-perf   Source File: GrepDriver.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {
    String in = args[0];
    String out = args[1];
    System.err.println("Using input=" + in);
    System.err.println("Using output=" + out);

    String[] patterns = new String[args.length - 2];
    System.arraycopy(args, 2, patterns, 0, args.length - 2);
    System.err.println("Using patterns: " + Arrays.toString(patterns));

    for (int i = 0; i < patterns.length; i++) {
        String pattern = patterns[i];
        Configuration conf = new Configuration();
        conf.set("pattern", pattern);
        Job job = Job.getInstance(conf, "Grep for " + pattern);
        job.setMapperClass(Grep.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setNumReduceTasks(0);
        job.setJarByClass(Grep.class);
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out + "_" + pattern));

        if (!job.waitForCompletion(true)) {
            throw new RuntimeException("Grep job " + i + " failed");
        }
    }
}
 
Example 27
Source Project: kylin-on-parquet-v2   Source File: KafkaMRInput.java    License: Apache License 2.0
@Override
public void configureJob(Job job) {
    job.setInputFormatClass(SequenceFileInputFormat.class);
    String jobId = job.getConfiguration().get(BatchConstants.ARG_CUBING_JOB_ID);
    IJoinedFlatTableDesc flatHiveTableDesc = new CubeJoinedFlatTableDesc(cubeSegment);
    String inputPath = JoinedFlatTable.getTableDir(flatHiveTableDesc,
            JobBuilderSupport.getJobWorkingDir(conf, jobId));
    try {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
 
Example 28
Source Project: incubator-iotdb   Source File: TSFHiveInputFormatTest.java    License: Apache License 2.0
@Before
public void setUp() {
  TsFileTestHelper.writeTsFile(filePath);
  inputFormat = new TSFHiveInputFormat();
  // normalize Windows path separators to forward slashes
  String jobPath = filePath.replaceAll("\\\\", "/");
  job = new JobConf();
  job.set(FileInputFormat.INPUT_DIR, jobPath);
  Path path = new Path(jobPath);
  String[] hosts = {"127.0.0.1"};
  inputSplit = new TSFInputSplit(path, hosts, 0, 3727688L);
}
 
Example 29
Source Project: RDFS   Source File: GenerateData.java    License: Apache License 2.0
@Override
public Job call() throws IOException, InterruptedException,
                         ClassNotFoundException {
  job.setMapperClass(GenDataMapper.class);
  job.setNumReduceTasks(0);
  job.setMapOutputKeyClass(NullWritable.class);
  job.setMapOutputValueClass(BytesWritable.class);
  job.setInputFormatClass(GenDataFormat.class);
  job.setOutputFormatClass(RawBytesOutputFormat.class);
  job.setJarByClass(GenerateData.class);
  FileInputFormat.addInputPath(job, new Path("ignored"));
  job.submit();
  return job;
}
 
Example 30
Source Project: Java-for-Data-Science   Source File: AveragePageCount.java    License: MIT License
public static void main(String[] args) throws Exception {
    Configuration con = new Configuration();
    Job bookJob = Job.getInstance(con, "Average Page Count");
    bookJob.setJarByClass(AveragePageCount.class);
    bookJob.setMapperClass(TextMapper.class);
    bookJob.setReducerClass(AverageReduce.class);
    bookJob.setOutputKeyClass(Text.class);
    bookJob.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(bookJob, new Path("C:/Hadoop/books.txt"));
    FileOutputFormat.setOutputPath(bookJob, new Path("C:/Hadoop/BookOutput"));
    if (bookJob.waitForCompletion(true)) {
        System.exit(0);
    }
}