Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat#addInputPath()

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileInputFormat#addInputPath(). Each example is taken from an open source project; the source file, project, and license are noted above the code.
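Before the project examples, here is a minimal, self-contained driver sketch of the call itself. The class name and paths are hypothetical placeholders, not taken from any project below; addInputPath() appends one more input path to the job, so calling it repeatedly registers several inputs for the same job.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class AddInputPathSketch {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "addInputPath sketch");
		job.setJarByClass(AddInputPathSketch.class);
		// Each call appends another input path; globs such as "/data/logs/*.gz" are also accepted.
		FileInputFormat.addInputPath(job, new Path("/data/input1"));
		FileInputFormat.addInputPath(job, new Path("/data/input2"));
		// No mapper/reducer is configured here, so the default identity Mapper and Reducer run.
		FileOutputFormat.setOutputPath(job, new Path("/data/output"));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}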
Example 1
Source File: WordCount.java    From wifi with Apache License 2.0
public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: wordcount <in> <out>");
			System.exit(2);
		}
		
		Job job = new Job(conf, "word count");
		job.setJarByClass(WordCount.class);
		job.setMapperClass(TokenizerMapper.class);
		job.setCombinerClass(IntSumReducer.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
		
	}
 
Example 2
Source File: MedianStdDev.java    From hadoop-map-reduce-patterns with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	if (otherArgs.length != 2) {
		System.err.println("Usage: MedianStdDev <in> <out>");
		ToolRunner.printGenericCommandUsage(System.err);
		System.exit(2);
	}

	Job job = new Job(conf,
			"StackOverflow Median and Standard Deviation Comment Length By Hour");
	job.setJarByClass(MedianStdDev.class);
	job.setMapperClass(MedianStdDevMapper.class);
	job.setReducerClass(MedianStdDevReducer.class);
	job.setOutputKeyClass(IntWritable.class);
	job.setOutputValueClass(IntWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	boolean success = job.waitForCompletion(true);

	return success ? 0 : 1;
}
 
Example 3
Source File: WordCount.java    From knox with Apache License 2.0
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println( "Usage: wordcount <in> <out>" );
    System.exit(2);
  }
  Job job = Job.getInstance(conf, "Word Count");
  job.setJarByClass(WordCount.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
  FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 4
Source File: CredentialsTestJob.java    From hadoop with Apache License 2.0
public Job createJob() throws IOException {
  Configuration conf = getConf();
  conf.setInt(MRJobConfig.NUM_MAPS, 1);
  Job job = Job.getInstance(conf, "test");
  job.setJarByClass(CredentialsTestJob.class);
  job.setNumReduceTasks(1);
  job.setMapperClass(CredentialsTestJob.CredentialsTestMapper.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(CredentialsTestJob.CredentialsTestReducer.class);
  job.setInputFormatClass(SleepJob.SleepInputFormat.class);
  job.setPartitionerClass(SleepJob.SleepJobPartitioner.class);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setSpeculativeExecution(false);
  job.setJobName("test job");
  FileInputFormat.addInputPath(job, new Path("ignored"));
  return job;
}
 
Example 5
Source File: WordMean.java    From big-c with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: wordmean <in> <out>");
    return 0;
  }

  Configuration conf = getConf();

  Job job = Job.getInstance(conf, "word mean");
  job.setJarByClass(WordMean.class);
  job.setMapperClass(WordMeanMapper.class);
  job.setCombinerClass(WordMeanReducer.class);
  job.setReducerClass(WordMeanReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  Path outputpath = new Path(args[1]);
  FileOutputFormat.setOutputPath(job, outputpath);
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);

  boolean completed = job.waitForCompletion(true);

  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " +
      (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

  return (completed ? 0 : 1);
}
 
Example 6
Source File: TestLocalModeWithNewApis.java    From hadoop with Apache License 2.0
@Test
public void testNewApis() throws Exception {
  Random r = new Random(System.currentTimeMillis());
  Path tmpBaseDir = new Path("/tmp/wc-" + r.nextInt());
  final Path inDir = new Path(tmpBaseDir, "input");
  final Path outDir = new Path(tmpBaseDir, "output");
  String input = "The quick brown fox\nhas many silly\nred fox sox\n";
  FileSystem inFs = inDir.getFileSystem(conf);
  FileSystem outFs = outDir.getFileSystem(conf);
  outFs.delete(outDir, true);
  if (!inFs.mkdirs(inDir)) {
    throw new IOException("Mkdirs failed to create " + inDir.toString());
  }
  {
    DataOutputStream file = inFs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();
  }

  Job job = Job.getInstance(conf, "word count");
  job.setJarByClass(TestLocalModeWithNewApis.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  FileInputFormat.addInputPath(job, inDir);
  FileOutputFormat.setOutputPath(job, outDir);
  assertEquals(true, job.waitForCompletion(true));

  String output = readOutput(outDir, conf);
  assertEquals("The\t1\nbrown\t1\nfox\t2\nhas\t1\nmany\t1\n" +
               "quick\t1\nred\t1\nsilly\t1\nsox\t1\n", output);
  
  outFs.delete(tmpBaseDir, true);
}
 
Example 7
Source File: TestMapperReducerCleanup.java    From big-c with Apache License 2.0
@Test
public void testMapCleanup() throws Exception {
  reset();
  
  Job job = Job.getInstance();

  Path inputPath = createInput();
  Path outputPath = getOutputPath();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);

  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }

  job.setMapperClass(FailingMapper.class);
  job.setInputFormatClass(TrackingTextInputFormat.class);
  job.setOutputFormatClass(TrackingTextOutputFormat.class);
  job.setNumReduceTasks(0);
  FileInputFormat.addInputPath(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.waitForCompletion(true);

  Assert.assertTrue(mapCleanup);
  Assert.assertTrue(recordReaderCleanup);
  Assert.assertTrue(recordWriterCleanup);
}
 
Example 8
Source File: TestReporter.java    From hadoop with Apache License 2.0
@Test
public void testStatusLimit() throws IOException, InterruptedException,
    ClassNotFoundException {
  Path test = new Path(testRootTempDir, "testStatusLimit");

  Configuration conf = new Configuration();
  Path inDir = new Path(test, "in");
  Path outDir = new Path(test, "out");
  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(inDir)) {
    fs.delete(inDir, true);
  }
  fs.mkdirs(inDir);
  DataOutputStream file = fs.create(new Path(inDir, "part-" + 0));
  file.writeBytes("testStatusLimit");
  file.close();

  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }

  Job job = Job.getInstance(conf, "testStatusLimit");

  job.setMapperClass(StatusLimitMapper.class);
  job.setNumReduceTasks(0);

  FileInputFormat.addInputPath(job, inDir);
  FileOutputFormat.setOutputPath(job, outDir);

  job.waitForCompletion(true);

  assertTrue("Job failed", job.isSuccessful());
}
 
Example 9
Source File: WETWordCount.java    From cc-warc-examples with MIT License
/**
 * Builds and runs the Hadoop job.
 * @return	0 if the Hadoop job completes successfully and 1 otherwise.
 */
@Override
public int run(String[] arg0) throws Exception {
	Configuration conf = getConf();
	//
	Job job = new Job(conf);
	job.setJarByClass(WETWordCount.class);
	job.setNumReduceTasks(1);
	
	String inputPath = "data/*.warc.wet.gz";
	//inputPath = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/wet/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.wet.gz";
	//inputPath = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/wet/*.warc.wet.gz";
	LOG.info("Input path: " + inputPath);
	FileInputFormat.addInputPath(job, new Path(inputPath));
	
	String outputPath = "/tmp/cc/";
	FileSystem fs = FileSystem.newInstance(conf);
	if (fs.exists(new Path(outputPath))) {
		fs.delete(new Path(outputPath), true);
	}
	FileOutputFormat.setOutputPath(job, new Path(outputPath));
	
	job.setInputFormatClass(WARCFileInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);
	
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(LongWritable.class);

	job.setMapperClass(WordCounterMap.WordCountMapper.class);
	// LongSumReducer sums the per-word counts emitted by the mapper
	job.setReducerClass(LongSumReducer.class);

	if (job.waitForCompletion(true)) {
		return 0;
	} else {
		return 1;
	}
}
 
Example 10
Source File: BlurInputFormat.java    From incubator-retired-blur with Apache License 2.0
public static void addTable(Job job, TableDescriptor tableDescriptor, String snapshot)
    throws IllegalArgumentException, IOException {
  String tableName = tableDescriptor.getName();
  Path path = new Path(tableDescriptor.getTableUri());
  FileInputFormat.addInputPath(job, path);
  putPathToTable(job.getConfiguration(), tableName, path);
  putSnapshotForTable(job.getConfiguration(), tableName, snapshot);
}
 
Example 11
Source File: FlowPartition.java    From MapReduce-Demo with MIT License
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
	// HDFS connection settings
	String namenode_ip = "192.168.17.10";
	String hdfs = "hdfs://" + namenode_ip + ":9000";
	Configuration conf = new Configuration();
	conf.set("fs.defaultFS", hdfs);
	conf.set("mapreduce.app-submission.cross-platform", "true");

	// Job configuration
	String jobName = "FlowPartition";
	Job job = Job.getInstance(conf, jobName);
	job.setJarByClass(FlowPartition.class);
	job.setJar("export\\FlowPartition.jar");
	// Map
	job.setMapperClass(FlowPartitionMapper.class);
	// Reduce
	job.setReducerClass(FlowPartitionReducer.class);
	// Output key/value types
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(FlowWritable.class);
	// Partitioner class and number of reducers
	job.setPartitionerClass(PhoneNumberPartitioner.class);
	job.setNumReduceTasks(4);

	// Job input and output paths
	String dataDir = "/workspace/flowStatistics/data";
	String outputDir = "/workspace/flowStatistics/output_partitions";
	Path inPath = new Path(hdfs + dataDir);
	Path outPath = new Path(hdfs + outputDir);
	FileInputFormat.addInputPath(job, inPath);
	FileOutputFormat.setOutputPath(job, outPath);
	FileSystem fs = FileSystem.get(conf);
	if (fs.exists(outPath)) {
		fs.delete(outPath, true);
	}

	// Run the job
	System.out.println("Job: " + jobName + " is running...");
	if (job.waitForCompletion(true)) {
		System.out.println("success!");
		System.exit(0);
	} else {
		System.out.println("failed!");
		System.exit(1);
	}
}
 
Example 12
Source File: DateSort2.java    From MapReduce-Demo with MIT License
public static void main(String[] args) throws Exception {		
		//1. HDFS connection settings
		String namenode_ip = "192.168.17.10";
		String hdfs = "hdfs://" + namenode_ip + ":9000";			
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", hdfs);
		conf.set("mapreduce.app-submission.cross-platform", "true");

		//2. MapReduce job configuration
		String jobName = "DateSort2";					// job name
		Job job = Job.getInstance(conf, jobName);
		job.setJarByClass(DateSort2.class);				// job class
		job.setJar("export\\DateSort2.jar");			// local jar to submit
//		Map
		job.setMapperClass(DateSort2Mapper.class);		// Mapper class
		job.setMapOutputKeyClass(IntWritable.class);	// Mapper output key type
		job.setMapOutputValueClass(Text.class);			// Mapper output value type
//		Reduce
		job.setReducerClass(DateSort2Reducer.class);	// Reducer class
		job.setOutputKeyClass(Text.class);				// Reducer output key type
		job.setOutputValueClass(IntWritable.class);		// Reducer output value type
//		Custom sort
		job.setSortComparatorClass(MySort.class);		// custom sort comparator class
		
		//3. Job input and output paths
		String dataDir = "/expr/datecount/output/part-r-00000";	// input data directory
		String outputDir = "/expr/datecount/output_sort2";			// output directory
		Path inPath = new Path(hdfs + dataDir);
		Path outPath = new Path(hdfs + outputDir);
		FileInputFormat.addInputPath(job, inPath);
		FileOutputFormat.setOutputPath(job, outPath);
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(outPath)) {
			fs.delete(outPath, true);
		}
		
		//4. Run the job
		System.out.println("Job: " + jobName + " is running...");
		if(job.waitForCompletion(true)) {
			System.out.println("success!");
			System.exit(0);
		} else {
			System.out.println("failed!");
			System.exit(1);
		}
	}
 
Example 13
Source File: MapReduceRunner.java    From halvade with GNU General Public License v3.0
protected int runPass1RNAJob(Configuration pass1Conf, String tmpOutDir) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
    HalvadeConf.setIsPass2(pass1Conf, false);
    HalvadeResourceManager.setJobResources(halvadeOpts, pass1Conf, HalvadeResourceManager.RNA_SHMEM_PASS1, halvadeOpts.nodes == 1, halvadeOpts.useBamInput);
    int pass2Reduces = HalvadeResourceManager.getPass2Reduces(halvadeOpts);
    halvadeOpts.splitChromosomes(pass1Conf, pass2Reduces);
    HalvadeConf.setPass2Suffix(pass1Conf, pass2suffix);
    
    Job pass1Job = Job.getInstance(pass1Conf, "Halvade pass 1 RNA pipeline");
    pass1Job.addCacheArchive(new URI(halvadeOpts.halvadeBinaries));
    pass1Job.setJarByClass(be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.class);
    // set pass 2 suffix so only this job finds it!
    FileSystem fs = FileSystem.get(new URI(halvadeOpts.in), pass1Conf);
    try {
        if (fs.getFileStatus(new Path(halvadeOpts.in)).isDirectory()) {
            // add every file in directory
            FileStatus[] files = fs.listStatus(new Path(halvadeOpts.in));
            for(FileStatus file : files) {
                if (!file.isDirectory()) {
                    FileInputFormat.addInputPath(pass1Job, file.getPath());
                }
            }
        } else {
            FileInputFormat.addInputPath(pass1Job, new Path(halvadeOpts.in));
        }
    } catch (IOException | IllegalArgumentException e) {
        Logger.EXCEPTION(e);
    }

    FileSystem outFs = FileSystem.get(new URI(tmpOutDir), pass1Conf);
    boolean skipPass1 = false;
    if (outFs.exists(new Path(tmpOutDir))) {
        // check if genome already exists
        skipPass1 = outFs.exists(new Path(tmpOutDir + "/_SUCCESS"));
        if(skipPass1)
            Logger.DEBUG("pass1 genome already created, skipping pass 1");
        else {
            Logger.INFO("The output directory \'" + tmpOutDir + "\' already exists.");
            Logger.INFO("ERROR: Please remove this directory before trying again.");
            System.exit(-2);
        }
    }
    if(!skipPass1) {
        FileOutputFormat.setOutputPath(pass1Job, new Path(tmpOutDir));
        pass1Job.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.class);

        pass1Job.setInputFormatClass(HalvadeTextInputFormat.class);
        pass1Job.setMapOutputKeyClass(GenomeSJ.class);
        pass1Job.setMapOutputValueClass(Text.class);

        pass1Job.setSortComparatorClass(GenomeSJSortComparator.class);
        pass1Job.setGroupingComparatorClass(GenomeSJGroupingComparator.class);
        pass1Job.setNumReduceTasks(1); 
        pass1Job.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.RebuildStarGenomeReducer.class);          
        pass1Job.setOutputKeyClass(LongWritable.class);
        pass1Job.setOutputValueClass(Text.class);

        return runTimedJob(pass1Job, "Halvade pass 1 Job");
    } else
        return 0;
}
 
Example 14
Source File: BlurOutputFormatTest.java    From incubator-retired-blur with Apache License 2.0
@Test
public void testBlurOutputFormat() throws IOException, InterruptedException, ClassNotFoundException {
  Path input = getInDir();
  Path output = getOutDir();
  _fileSystem.delete(input, true);
  _fileSystem.delete(output, true);
  writeRecordsFile(new Path(input, "part1"), 1, 1, 1, 1, "cf1");
  writeRecordsFile(new Path(input, "part2"), 1, 1, 2, 1, "cf1");

  Job job = Job.getInstance(_conf, "blur index");
  job.setJarByClass(BlurOutputFormatTest.class);
  job.setMapperClass(CsvBlurMapper.class);
  job.setInputFormatClass(TextInputFormat.class);

  FileInputFormat.addInputPath(job, input);
  CsvBlurMapper.addColumns(job, "cf1", "col");

  Path tablePath = new Path(new Path(_root, "table"), "test");

  TableDescriptor tableDescriptor = new TableDescriptor();
  tableDescriptor.setShardCount(1);
  tableDescriptor.setTableUri(tablePath.toString());
  tableDescriptor.setName("test");

  createShardDirectories(tablePath, 1);

  BlurOutputFormat.setupJob(job, tableDescriptor);
  BlurOutputFormat.setOutputPath(job, output);

  assertTrue(job.waitForCompletion(true));
  Counters ctrs = job.getCounters();
  System.out.println("Counters: " + ctrs);

  Path path = new Path(output, ShardUtil.getShardName(0));
  dump(path, _conf);
  Collection<Path> commitedTasks = getCommitedTasks(path);
  assertEquals(1, commitedTasks.size());
  DirectoryReader reader = DirectoryReader.open(new HdfsDirectory(_conf, commitedTasks.iterator().next()));
  assertEquals(2, reader.numDocs());
  reader.close();
}
 
Example 15
Source File: KeyValueInput.java    From MapReduce-Demo with MIT License
public static void main(String[] args) throws Exception {		
	//1. HDFS connection settings
	String namenode_ip = "192.168.17.10";
	String hdfs = "hdfs://" + namenode_ip + ":9000";			
	Configuration conf = new Configuration();
	conf.set("fs.defaultFS", hdfs);
	conf.set("mapreduce.app-submission.cross-platform", "true");
	conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ":");	//设置输入文件kv分隔符
	
	//2. MapReduce job configuration
	String jobName = "KeyValueInput";					// job name
	Job job = Job.getInstance(conf, jobName);
	job.setJarByClass(KeyValueInput.class);				// job class
	job.setJar("export\\KeyValueInput.jar");			// local jar to submit
	job.setMapperClass(KeyValueInputMapper.class);		// Mapper class
	job.setMapOutputKeyClass(Text.class);				// Mapper output key type
	job.setMapOutputValueClass(IntWritable.class);		// Mapper output value type
	job.setReducerClass(KeyValueInputReducer.class);	// Reducer class
	job.setOutputKeyClass(Text.class);					// Reducer output key type
	job.setOutputValueClass(IntWritable.class);			// Reducer output value type
	
	job.setInputFormatClass(KeyValueTextInputFormat.class);	// input format class
	
	//3. Job input and output paths
	String dataDir = "/expr/kvinput/data";			// input data directory
	String outputDir = "/expr/kvinput/output";		// output directory
	Path inPath = new Path(hdfs + dataDir);
	Path outPath = new Path(hdfs + outputDir);
	FileInputFormat.addInputPath(job, inPath);
	FileOutputFormat.setOutputPath(job, outPath);
	FileSystem fs = FileSystem.get(conf);
	if(fs.exists(outPath)) {
		fs.delete(outPath, true);
	}
	
	//4. Run the job
	System.out.println("Job: " + jobName + " is running...");
	if(job.waitForCompletion(true)) {
		System.out.println("success!");
		System.exit(0);
	} else {
		System.out.println("failed!");
		System.exit(1);
	}
}
 
Example 16
Source File: AggregationPhaseJob.java    From incubator-pinot with Apache License 2.0
public Job run() throws Exception {
  Job job = Job.getInstance(getConf());
  job.setJobName(name);
  job.setJarByClass(AggregationPhaseJob.class);

  FileSystem fs = FileSystem.get(getConf());
  Configuration configuration = job.getConfiguration();

  // Properties
  LOGGER.info("Properties {}", props);

  // Input path
  String inputPathDir = getAndSetConfiguration(configuration, AGG_PHASE_INPUT_PATH);
  LOGGER.info("Input path dir: " + inputPathDir);
  for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) {
    LOGGER.info("Adding input: " + inputPath);
    Path input = new Path(inputPath);
    FileInputFormat.addInputPath(job, input);
  }

  // Output path
  Path outputPath = new Path(getAndSetConfiguration(configuration, AGG_PHASE_OUTPUT_PATH));
  LOGGER.info("Output path dir: " + outputPath.toString());
  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(job, outputPath);

  // Schema
  Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
  LOGGER.info("Schema : {}", avroSchema.toString(true));
  job.getConfiguration().set(AGG_PHASE_AVRO_SCHEMA.toString(), avroSchema.toString());

  // ThirdEyeConfig
  String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty(
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty);
  String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
  ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
  job.getConfiguration().set(AGG_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

  // Map config
  job.setMapperClass(AggregationMapper.class);
  job.setInputFormatClass(AvroKeyInputFormat.class);
  job.setMapOutputKeyClass(BytesWritable.class);
  job.setMapOutputValueClass(BytesWritable.class);

  // Reduce config
  job.setReducerClass(AggregationReducer.class);
  job.setOutputKeyClass(AvroKey.class);
  job.setOutputValueClass(NullWritable.class);
  AvroJob.setOutputKeySchema(job, avroSchema);
  job.setOutputFormatClass(AvroKeyOutputFormat.class);
  String numReducers = props.getProperty(ThirdEyeJobProperties.THIRDEYE_NUM_REDUCERS.getName());
  LOGGER.info("Num Reducers : {}", numReducers);
  if (StringUtils.isNotBlank(numReducers)) {
    job.setNumReduceTasks(Integer.valueOf(numReducers));
    LOGGER.info("Setting num reducers {}", job.getNumReduceTasks());
  }

  job.waitForCompletion(true);

  Counter counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS);
  LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
  if (counter.getValue() == 0) {
    throw new IllegalStateException("No input records in " + inputPathDir);
  }
  counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS_FLATTENED);
  LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());

  for (String metric : thirdeyeConfig.getMetricNames()) {
    counter = job.getCounters().findCounter(thirdeyeConfig.getCollection(), metric);
    LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
  }

  return job;
}
 
Example 17
Source File: ConfigurableHDFSFileSource.java    From components with Apache License 2.0
private FileInputFormat<?, ?> createFormat(Job job) throws IOException, IllegalAccessException, InstantiationException {
    Path path = new Path(filepattern);
    FileInputFormat.addInputPath(job, path);
    return formatClass.newInstance();
}
 
Example 18
Source File: Missed.java    From MapReduce-Demo with MIT License
public static void main(String[] args) throws Exception {		
	//1. HDFS connection settings
	String namenode_ip = "192.168.17.10";
	String hdfs = "hdfs://" + namenode_ip + ":9000";			
	Configuration conf = new Configuration();
	conf.set("fs.defaultFS", hdfs);
	conf.set("mapreduce.app-submission.cross-platform", "true");

	//2. MapReduce job configuration
	String jobName = "Missed";						// job name
	Job job = Job.getInstance(conf, jobName);
	job.setJarByClass(Missed.class);				// job class
	job.setJar("export\\Missed.jar");				// local jar to submit
	job.setMapperClass(MissedMapper.class);			// Mapper class
	job.setMapOutputKeyClass(Text.class);			// Mapper output key type
	job.setMapOutputValueClass(NullWritable.class);	// Mapper output value type
	job.setReducerClass(MissedReducer.class);		// Reducer class
	// Define the named multi-file output: name, output format, key type, value type
	MultipleOutputs.addNamedOutput(job, "missed", TextOutputFormat.class, Text.class, NullWritable.class);
	
	//3. Job input and output paths
	String dataDir = "/expr/weblog/data";			// input data directory
	String outputDir = "/expr/weblog/output2";		// output directory
	Path inPath = new Path(hdfs + dataDir);
	Path outPath = new Path(hdfs + outputDir);
	FileInputFormat.addInputPath(job, inPath);
	FileOutputFormat.setOutputPath(job, outPath);
	FileSystem fs = FileSystem.get(conf);
	if(fs.exists(outPath)) {
		fs.delete(outPath, true);
	}
	
	//4. Run the job
	System.out.println("Job: " + jobName + " is running...");
	if(job.waitForCompletion(true)) {
		System.out.println("success!");
		System.exit(0);
	} else {
		System.out.println("failed!");
		System.exit(1);
	}
}
 
Example 19
Source File: BlurOutputFormatTest.java    From incubator-retired-blur with Apache License 2.0
public void testBlurOutputFormatCleanupDuringJobKillTest() throws IOException, InterruptedException,
    ClassNotFoundException {
  Path input = getInDir();
  Path output = getOutDir();
  _fileSystem.delete(input, true);
  _fileSystem.delete(output, true);
  // 1500 * 50 = 75,000
  writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1");
  // 100 * 5000 = 500,000
  writeRecordsFile(new Path(input, "part2"), 1, 5000, 2000, 100, "cf1");

  Job job = Job.getInstance(_conf, "blur index");
  job.setJarByClass(BlurOutputFormatTest.class);
  job.setMapperClass(CsvBlurMapper.class);
  job.setInputFormatClass(TextInputFormat.class);

  FileInputFormat.addInputPath(job, input);
  CsvBlurMapper.addColumns(job, "cf1", "col");

  Path tablePath = new Path(new Path(_root, "table"), "test");

  TableDescriptor tableDescriptor = new TableDescriptor();
  tableDescriptor.setShardCount(2);
  tableDescriptor.setTableUri(tablePath.toString());
  tableDescriptor.setName("test");

  createShardDirectories(getOutDir(), 2);

  BlurOutputFormat.setupJob(job, tableDescriptor);
  BlurOutputFormat.setOutputPath(job, output);
  BlurOutputFormat.setIndexLocally(job, false);

  job.submit();
  boolean killCalled = false;
  while (!job.isComplete()) {
    Thread.sleep(1000);
    System.out.printf("Killed [" + killCalled + "] Map [%f] Reduce [%f]%n", job.mapProgress() * 100,
        job.reduceProgress() * 100);
    if (job.reduceProgress() > 0.7 && !killCalled) {
      job.killJob();
      killCalled = true;
    }
  }

  assertFalse(job.isSuccessful());

  for (int i = 0; i < tableDescriptor.getShardCount(); i++) {
    Path path = new Path(output, ShardUtil.getShardName(i));
    FileSystem fileSystem = path.getFileSystem(job.getConfiguration());
    FileStatus[] listStatus = fileSystem.listStatus(path);
    assertEquals(toString(listStatus), 0, listStatus.length);
  }
}
 
Example 20
Source File: IndexerJobDriver.java    From incubator-retired-blur with Apache License 2.0
private boolean runAutomatic(String uuid, TableDescriptor descriptor, List<Path> inprogressPathList, String table,
    Path fileCache, Path outputPath, int reducerMultipler, Path tmpPath, TableStats tableStats, String snapshot)
    throws ClassNotFoundException, IOException, InterruptedException {
  PartitionedInputResult result = buildPartitionedInputData(uuid, tmpPath, descriptor, inprogressPathList, snapshot,
      fileCache);

  Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");

  InputSplitPruneUtil.setBlurLookupRowIdFromNewDataCounts(job, table, result._rowIdsFromNewData);
  InputSplitPruneUtil.setBlurLookupRowIdUpdateFromNewDataCounts(job, table, result._rowIdsToUpdateFromNewData);
  InputSplitPruneUtil.setBlurLookupRowIdFromIndexCounts(job, table, result._rowIdsFromIndex);
  InputSplitPruneUtil.setTable(job, table);

  BlurInputFormat.setLocalCachePath(job, fileCache);

  // Existing data - This adds the copy data files first open and stream
  // through all documents.
  {
    Path tablePath = new Path(descriptor.getTableUri());
    BlurInputFormat.addTable(job, descriptor, MRUPDATE_SNAPSHOT);
    MultipleInputs.addInputPath(job, tablePath, PrunedBlurInputFormat.class, ExistingDataMapper.class);
  }

  // Existing data - This adds the row id lookup
  {
    ExistingDataIndexLookupMapper.setSnapshot(job, MRUPDATE_SNAPSHOT);
    FileInputFormat.addInputPath(job, result._partitionedInputData);
    MultipleInputs.addInputPath(job, result._partitionedInputData, PrunedSequenceFileInputFormat.class,
        ExistingDataIndexLookupMapper.class);
  }

  // New Data
  for (Path p : inprogressPathList) {
    FileInputFormat.addInputPath(job, p);
    MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, NewDataMapper.class);
  }

  BlurOutputFormat.setOutputPath(job, outputPath);
  BlurOutputFormat.setupJob(job, descriptor);

  job.setReducerClass(UpdateReducer.class);
  job.setMapOutputKeyClass(IndexKey.class);
  job.setMapOutputValueClass(IndexValue.class);
  job.setPartitionerClass(IndexKeyPartitioner.class);
  job.setGroupingComparatorClass(IndexKeyWritableComparator.class);

  BlurOutputFormat.setReducerMultiplier(job, reducerMultipler);

  boolean success = job.waitForCompletion(true);
  Counters counters = job.getCounters();
  LOG.info("Counters [" + counters + "]");
  return success;
}