Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat#addInputPath()

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileInputFormat#addInputPath(). Each example is taken from an open source project; the source file, project, and license are noted above the code.
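Before the project examples, here is a minimal, self-contained driver sketch of the call itself. The class name and paths are hypothetical placeholders, not taken from any project below; addInputPath() appends one more input path to the job, so calling it repeatedly registers several inputs for the same job.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class AddInputPathSketch {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "addInputPath sketch");
		job.setJarByClass(AddInputPathSketch.class);
		// Each call appends another input path; globs such as "/data/logs/*.gz" are also accepted.
		FileInputFormat.addInputPath(job, new Path("/data/input1"));
		FileInputFormat.addInputPath(job, new Path("/data/input2"));
		// No mapper/reducer is configured here, so the default identity Mapper and Reducer run.
		FileOutputFormat.setOutputPath(job, new Path("/data/output"));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}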
Example 1
Source File: WordCount.java    From wifi with Apache License 2.0
public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: wordcount <in> <out>");
			System.exit(2);
		}
		
		Job job = new Job(conf, "word count");
		job.setJarByClass(WordCount.class);
		job.setMapperClass(TokenizerMapper.class);
		job.setCombinerClass(IntSumReducer.class);
		job.setReducerClass(IntSumReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
		
	}
 
Example 2
Source File: MedianStdDev.java    From hadoop-map-reduce-patterns with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	if (otherArgs.length != 2) {
		System.err.println("Usage: MedianStdDev <in> <out>");
		ToolRunner.printGenericCommandUsage(System.err);
		System.exit(2);
	}

	Job job = new Job(conf,
			"StackOverflow Median and Standard Deviation Comment Length By Hour");
	job.setJarByClass(MedianStdDev.class);
	job.setMapperClass(MedianStdDevMapper.class);
	job.setReducerClass(MedianStdDevReducer.class);
	job.setOutputKeyClass(IntWritable.class);
	job.setOutputValueClass(IntWritable.class);
	FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
	FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
	boolean success = job.waitForCompletion(true);

	return success ? 0 : 1;
}
 
Example 3
Source File: WordCount.java    From knox with Apache License 2.0
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println( "Usage: wordcount <in> <out>" );
    System.exit(2);
  }
  Job job = Job.getInstance(conf, "Word Count");
  job.setJarByClass(WordCount.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
  FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 4
Source File: CredentialsTestJob.java    From hadoop with Apache License 2.0
public Job createJob() throws IOException {
  Configuration conf = getConf();
  conf.setInt(MRJobConfig.NUM_MAPS, 1);
  Job job = Job.getInstance(conf, "test");
  job.setJarByClass(CredentialsTestJob.class);
  job.setNumReduceTasks(1);
  job.setMapperClass(CredentialsTestJob.CredentialsTestMapper.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(CredentialsTestJob.CredentialsTestReducer.class);
  job.setInputFormatClass(SleepJob.SleepInputFormat.class);
  job.setPartitionerClass(SleepJob.SleepJobPartitioner.class);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setSpeculativeExecution(false);
  job.setJobName("test job");
  FileInputFormat.addInputPath(job, new Path("ignored"));
  return job;
}
 
Example 5
Source File: WordMean.java    From big-c with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: wordmean <in> <out>");
    return 0;
  }

  Configuration conf = getConf();

  Job job = Job.getInstance(conf, "word mean");
  job.setJarByClass(WordMean.class);
  job.setMapperClass(WordMeanMapper.class);
  job.setCombinerClass(WordMeanReducer.class);
  job.setReducerClass(WordMeanReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  Path outputpath = new Path(args[1]);
  FileOutputFormat.setOutputPath(job, outputpath);
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);

  boolean completed = job.waitForCompletion(true);

  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " +
      (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

  return (completed ? 0 : 1);
}
 
Example 6
Source File: TestLocalModeWithNewApis.java    From hadoop with Apache License 2.0
@Test
public void testNewApis() throws Exception {
  Random r = new Random(System.currentTimeMillis());
  Path tmpBaseDir = new Path("/tmp/wc-" + r.nextInt());
  final Path inDir = new Path(tmpBaseDir, "input");
  final Path outDir = new Path(tmpBaseDir, "output");
  String input = "The quick brown fox\nhas many silly\nred fox sox\n";
  FileSystem inFs = inDir.getFileSystem(conf);
  FileSystem outFs = outDir.getFileSystem(conf);
  outFs.delete(outDir, true);
  if (!inFs.mkdirs(inDir)) {
    throw new IOException("Mkdirs failed to create " + inDir.toString());
  }
  {
    DataOutputStream file = inFs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();
  }

  Job job = Job.getInstance(conf, "word count");
  job.setJarByClass(TestLocalModeWithNewApis.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  FileInputFormat.addInputPath(job, inDir);
  FileOutputFormat.setOutputPath(job, outDir);
  assertEquals(true, job.waitForCompletion(true));

  String output = readOutput(outDir, conf);
  assertEquals("The\t1\nbrown\t1\nfox\t2\nhas\t1\nmany\t1\n" +
               "quick\t1\nred\t1\nsilly\t1\nsox\t1\n", output);
  
  outFs.delete(tmpBaseDir, true);
}
 
Example 7
Source File: TestMapperReducerCleanup.java    From big-c with Apache License 2.0
@Test
public void testMapCleanup() throws Exception {
  reset();
  
  Job job = Job.getInstance();

  Path inputPath = createInput();
  Path outputPath = getOutputPath();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);

  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }

  job.setMapperClass(FailingMapper.class);
  job.setInputFormatClass(TrackingTextInputFormat.class);
  job.setOutputFormatClass(TrackingTextOutputFormat.class);
  job.setNumReduceTasks(0);
  FileInputFormat.addInputPath(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.waitForCompletion(true);

  Assert.assertTrue(mapCleanup);
  Assert.assertTrue(recordReaderCleanup);
  Assert.assertTrue(recordWriterCleanup);
}
 
Example 8
Source File: TestReporter.java    From hadoop with Apache License 2.0
@Test
public void testStatusLimit() throws IOException, InterruptedException,
    ClassNotFoundException {
  Path test = new Path(testRootTempDir, "testStatusLimit");

  Configuration conf = new Configuration();
  Path inDir = new Path(test, "in");
  Path outDir = new Path(test, "out");
  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(inDir)) {
    fs.delete(inDir, true);
  }
  fs.mkdirs(inDir);
  DataOutputStream file = fs.create(new Path(inDir, "part-" + 0));
  file.writeBytes("testStatusLimit");
  file.close();

  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }

  Job job = Job.getInstance(conf, "testStatusLimit");

  job.setMapperClass(StatusLimitMapper.class);
  job.setNumReduceTasks(0);

  FileInputFormat.addInputPath(job, inDir);
  FileOutputFormat.setOutputPath(job, outDir);

  job.waitForCompletion(true);

  assertTrue("Job failed", job.isSuccessful());
}
 
Example 9
Source File: WETWordCount.java    From cc-warc-examples with MIT License
/**
 * Builds and runs the Hadoop job.
 * @return	0 if the Hadoop job completes successfully and 1 otherwise.
 */
@Override
public int run(String[] arg0) throws Exception {
	Configuration conf = getConf();
	//
	Job job = new Job(conf);
	job.setJarByClass(WETWordCount.class);
	job.setNumReduceTasks(1);
	
	String inputPath = "data/*.warc.wet.gz";
	//inputPath = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/wet/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.wet.gz";
	//inputPath = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/wet/*.warc.wet.gz";
	LOG.info("Input path: " + inputPath);
	FileInputFormat.addInputPath(job, new Path(inputPath));
	
	String outputPath = "/tmp/cc/";
	FileSystem fs = FileSystem.newInstance(conf);
	if (fs.exists(new Path(outputPath))) {
		fs.delete(new Path(outputPath), true);
	}
	FileOutputFormat.setOutputPath(job, new Path(outputPath));
	
	job.setInputFormatClass(WARCFileInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);
	
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(LongWritable.class);

	job.setMapperClass(WordCounterMap.WordCountMapper.class);
	// LongSumReducer sums the per-word counts emitted by the mapper
	job.setReducerClass(LongSumReducer.class);

	if (job.waitForCompletion(true)) {
		return 0;
	} else {
		return 1;
	}
}
 
Example 10
Source File: BlurInputFormat.java    From incubator-retired-blur with Apache License 2.0
public static void addTable(Job job, TableDescriptor tableDescriptor, String snapshot)
    throws IllegalArgumentException, IOException {
  String tableName = tableDescriptor.getName();
  Path path = new Path(tableDescriptor.getTableUri());
  FileInputFormat.addInputPath(job, path);
  putPathToTable(job.getConfiguration(), tableName, path);
  putSnapshotForTable(job.getConfiguration(), tableName, snapshot);
}
 
Example 11
Source File: FlowPartition.java    From MapReduce-Demo with MIT License
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
	// HDFS connection settings
	String namenode_ip = "192.168.17.10";
	String hdfs = "hdfs://" + namenode_ip + ":9000";
	Configuration conf = new Configuration();
	conf.set("fs.defaultFS", hdfs);
	conf.set("mapreduce.app-submission.cross-platform", "true");

	// Job configuration
	String jobName = "FlowPartition";
	Job job = Job.getInstance(conf, jobName);
	job.setJarByClass(FlowPartition.class);
	job.setJar("export\\FlowPartition.jar");
	// Map
	job.setMapperClass(FlowPartitionMapper.class);
	// Reduce
	job.setReducerClass(FlowPartitionReducer.class);
	// Output key/value types
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(FlowWritable.class);
	// Partitioner class and number of reducers
	job.setPartitionerClass(PhoneNumberPartitioner.class);
	job.setNumReduceTasks(4);

	// Job input and output paths
	String dataDir = "/workspace/flowStatistics/data";
	String outputDir = "/workspace/flowStatistics/output_partitions";
	Path inPath = new Path(hdfs + dataDir);
	Path outPath = new Path(hdfs + outputDir);
	FileInputFormat.addInputPath(job, inPath);
	FileOutputFormat.setOutputPath(job, outPath);
	FileSystem fs = FileSystem.get(conf);
	if (fs.exists(outPath)) {
		fs.delete(outPath, true);
	}

	// Run the job
	System.out.println("Job: " + jobName + " is running...");
	if (job.waitForCompletion(true)) {
		System.out.println("success!");
		System.exit(0);
	} else {
		System.out.println("failed!");
		System.exit(1);
	}
}
 
Example 12
Source File: DateSort2.java    From MapReduce-Demo with MIT License
public static void main(String[] args) throws Exception {		
		//1. HDFS connection settings
		String namenode_ip = "192.168.17.10";
		String hdfs = "hdfs://" + namenode_ip + ":9000";			
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", hdfs);
		conf.set("mapreduce.app-submission.cross-platform", "true");

		//2. MapReduce job configuration
		String jobName = "DateSort2";					// job name
		Job job = Job.getInstance(conf, jobName);
		job.setJarByClass(DateSort2.class);				// job class
		job.setJar("export\\DateSort2.jar");			// local jar to submit
//		Map
		job.setMapperClass(DateSort2Mapper.class);		// Mapper class
		job.setMapOutputKeyClass(IntWritable.class);	// Mapper output key type
		job.setMapOutputValueClass(Text.class);			// Mapper output value type
//		Reduce
		job.setReducerClass(DateSort2Reducer.class);	// Reducer class
		job.setOutputKeyClass(Text.class);				// Reducer output key type
		job.setOutputValueClass(IntWritable.class);		// Reducer output value type
//		Custom sort
		job.setSortComparatorClass(MySort.class);		// custom sort comparator class
		
		//3. Job input and output paths
		String dataDir = "/expr/datecount/output/part-r-00000";	// input data directory
		String outputDir = "/expr/datecount/output_sort2";			// output directory
		Path inPath = new Path(hdfs + dataDir);
		Path outPath = new Path(hdfs + outputDir);
		FileInputFormat.addInputPath(job, inPath);
		FileOutputFormat.setOutputPath(job, outPath);
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(outPath)) {
			fs.delete(outPath, true);
		}
		
		//4. Run the job
		System.out.println("Job: " + jobName + " is running...");
		if(job.waitForCompletion(true)) {
			System.out.println("success!");
			System.exit(0);
		} else {
			System.out.println("failed!");
			System.exit(1);
		}
	}
 
Example 13
Source File: MapReduceRunner.java    From halvade with GNU General Public License v3.0
protected int runPass1RNAJob(Configuration pass1Conf, String tmpOutDir) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
    HalvadeConf.setIsPass2(pass1Conf, false);
    HalvadeResourceManager.setJobResources(halvadeOpts, pass1Conf, HalvadeResourceManager.RNA_SHMEM_PASS1, halvadeOpts.nodes == 1, halvadeOpts.useBamInput);
    int pass2Reduces = HalvadeResourceManager.getPass2Reduces(halvadeOpts);
    halvadeOpts.splitChromosomes(pass1Conf, pass2Reduces);
    HalvadeConf.setPass2Suffix(pass1Conf, pass2suffix);
    
    Job pass1Job = Job.getInstance(pass1Conf, "Halvade pass 1 RNA pipeline");
    pass1Job.addCacheArchive(new URI(halvadeOpts.halvadeBinaries));
    pass1Job.setJarByClass(be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.class);
    // set pass 2 suffix so only this job finds it!
    FileSystem fs = FileSystem.get(new URI(halvadeOpts.in), pass1Conf);
    try {
        if (fs.getFileStatus(new Path(halvadeOpts.in)).isDirectory()) {
            // add every file in directory
            FileStatus[] files = fs.listStatus(new Path(halvadeOpts.in));
            for(FileStatus file : files) {
                if (!file.isDirectory()) {
                    FileInputFormat.addInputPath(pass1Job, file.getPath());
                }
            }
        } else {
            FileInputFormat.addInputPath(pass1Job, new Path(halvadeOpts.in));
        }
    } catch (IOException | IllegalArgumentException e) {
        Logger.EXCEPTION(e);
    }

    FileSystem outFs = FileSystem.get(new URI(tmpOutDir), pass1Conf);
    boolean skipPass1 = false;
    if (outFs.exists(new Path(tmpOutDir))) {
        // check if genome already exists
        skipPass1 = outFs.exists(new Path(tmpOutDir + "/_SUCCESS"));
        if(skipPass1)
            Logger.DEBUG("pass1 genome already created, skipping pass 1");
        else {
            Logger.INFO("The output directory \'" + tmpOutDir + "\' already exists.");
            Logger.INFO("ERROR: Please remove this directory before trying again.");
            System.exit(-2);
        }
    }
    if(!skipPass1) {
        FileOutputFormat.setOutputPath(pass1Job, new Path(tmpOutDir));
        pass1Job.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.class);

        pass1Job.setInputFormatClass(HalvadeTextInputFormat.class);
        pass1Job.setMapOutputKeyClass(GenomeSJ.class);
        pass1Job.setMapOutputValueClass(Text.class);

        pass1Job.setSortComparatorClass(GenomeSJSortComparator.class);
        pass1Job.setGroupingComparatorClass(GenomeSJGroupingComparator.class);
        pass1Job.setNumReduceTasks(1); 
        pass1Job.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.RebuildStarGenomeReducer.class);          
        pass1Job.setOutputKeyClass(LongWritable.class);
        pass1Job.setOutputValueClass(Text.class);

        return runTimedJob(pass1Job, "Halvade pass 1 Job");
    } else
        return 0;
}
 
Example 14
Source File: BlurOutputFormatTest.java    From incubator-retired-blur with Apache License 2.0
@Test
public void testBlurOutputFormat() throws IOException, InterruptedException, ClassNotFoundException {
  Path input = getInDir();
  Path output = getOutDir();
  _fileSystem.delete(input, true);
  _fileSystem.delete(output, true);
  writeRecordsFile(new Path(input, "part1"), 1, 1, 1, 1, "cf1");
  writeRecordsFile(new Path(input, "part2"), 1, 1, 2, 1, "cf1");

  Job job = Job.getInstance(_conf, "blur index");
  job.setJarByClass(BlurOutputFormatTest.class);
  job.setMapperClass(CsvBlurMapper.class);
  job.setInputFormatClass(TextInputFormat.class);

  FileInputFormat.addInputPath(job, input);
  CsvBlurMapper.addColumns(job, "cf1", "col");

  Path tablePath = new Path(new Path(_root, "table"), "test");

  TableDescriptor tableDescriptor = new TableDescriptor();
  tableDescriptor.setShardCount(1);
  tableDescriptor.setTableUri(tablePath.toString());
  tableDescriptor.setName("test");

  createShardDirectories(tablePath, 1);

  BlurOutputFormat.setupJob(job, tableDescriptor);
  BlurOutputFormat.setOutputPath(job, output);

  assertTrue(job.waitForCompletion(true));
  Counters ctrs = job.getCounters();
  System.out.println("Counters: " + ctrs);

  Path path = new Path(output, ShardUtil.getShardName(0));
  dump(path, _conf);
  Collection<Path> commitedTasks = getCommitedTasks(path);
  assertEquals(1, commitedTasks.size());
  DirectoryReader reader = DirectoryReader.open(new HdfsDirectory(_conf, commitedTasks.iterator().next()));
  assertEquals(2, reader.numDocs());
  reader.close();
}
 
Example 15
Source File: KeyValueInput.java    From MapReduce-Demo with MIT License
public static void main(String[] args) throws Exception {		
	//1. HDFS connection settings
	String namenode_ip = "192.168.17.10";
	String hdfs = "hdfs://" + namenode_ip + ":9000";			
	Configuration conf = new Configuration();
	conf.set("fs.defaultFS", hdfs);
	conf.set("mapreduce.app-submission.cross-platform", "true");
	conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ":");	//设置输入文件kv分隔符
	
	//2. MapReduce job configuration
	String jobName = "KeyValueInput";					// job name
	Job job = Job.getInstance(conf, jobName);
	job.setJarByClass(KeyValueInput.class);				// job class
	job.setJar("export\\KeyValueInput.jar");			// local jar to submit
	job.setMapperClass(KeyValueInputMapper.class);		// Mapper class
	job.setMapOutputKeyClass(Text.class);				// Mapper output key type
	job.setMapOutputValueClass(IntWritable.class);		// Mapper output value type
	job.setReducerClass(KeyValueInputReducer.class);	// Reducer class
	job.setOutputKeyClass(Text.class);					// Reducer output key type
	job.setOutputValueClass(IntWritable.class);			// Reducer output value type
	
	job.setInputFormatClass(KeyValueTextInputFormat.class);	// input format class
	
	//3. Job input and output paths
	String dataDir = "/expr/kvinput/data";			// input data directory
	String outputDir = "/expr/kvinput/output";		// output directory
	Path inPath = new Path(hdfs + dataDir);
	Path outPath = new Path(hdfs + outputDir);
	FileInputFormat.addInputPath(job, inPath);
	FileOutputFormat.setOutputPath(job, outPath);
	FileSystem fs = FileSystem.get(conf);
	if(fs.exists(outPath)) {
		fs.delete(outPath, true);
	}
	
	//4. Run the job
	System.out.println("Job: " + jobName + " is running...");
	if(job.waitForCompletion(true)) {
		System.out.println("success!");
		System.exit(0);
	} else {
		System.out.println("failed!");
		System.exit(1);
	}
}
 
Example 16
Source File: AggregationPhaseJob.java    From incubator-pinot with Apache License 2.0
public Job run() throws Exception {
  Job job = Job.getInstance(getConf());
  job.setJobName(name);
  job.setJarByClass(AggregationPhaseJob.class);

  FileSystem fs = FileSystem.get(getConf());
  Configuration configuration = job.getConfiguration();

  // Properties
  LOGGER.info("Properties {}", props);

  // Input path
  String inputPathDir = getAndSetConfiguration(configuration, AGG_PHASE_INPUT_PATH);
  LOGGER.info("Input path dir: " + inputPathDir);
  for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) {
    LOGGER.info("Adding input: " + inputPath);
    Path input = new Path(inputPath);
    FileInputFormat.addInputPath(job, input);
  }

  // Output path
  Path outputPath = new Path(getAndSetConfiguration(configuration, AGG_PHASE_OUTPUT_PATH));
  LOGGER.info("Output path dir: " + outputPath.toString());
  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(job, outputPath);

  // Schema
  Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
  LOGGER.info("Schema : {}", avroSchema.toString(true));
  job.getConfiguration().set(AGG_PHASE_AVRO_SCHEMA.toString(), avroSchema.toString());

  // ThirdEyeConfig
  String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty(
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty);
  String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
  ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
  job.getConfiguration().set(AGG_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

  // Map config
  job.setMapperClass(AggregationMapper.class);
  job.setInputFormatClass(AvroKeyInputFormat.class);
  job.setMapOutputKeyClass(BytesWritable.class);
  job.setMapOutputValueClass(BytesWritable.class);

  // Reduce config
  job.setReducerClass(AggregationReducer.class);
  job.setOutputKeyClass(AvroKey.class);
  job.setOutputValueClass(NullWritable.class);
  AvroJob.setOutputKeySchema(job, avroSchema);
  job.setOutputFormatClass(AvroKeyOutputFormat.class);
  String numReducers = props.getProperty(ThirdEyeJobProperties.THIRDEYE_NUM_REDUCERS.getName());
  LOGGER.info("Num Reducers : {}", numReducers);
  if (StringUtils.isNotBlank(numReducers)) {
    job.setNumReduceTasks(Integer.valueOf(numReducers));
    LOGGER.info("Setting num reducers {}", job.getNumReduceTasks());
  }

  job.waitForCompletion(true);

  Counter counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS);
  LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
  if (counter.getValue() == 0) {
    throw new IllegalStateException("No input records in " + inputPathDir);
  }
  counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS_FLATTENED);
  LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());

  for (String metric : thirdeyeConfig.getMetricNames()) {
    counter = job.getCounters().findCounter(thirdeyeConfig.getCollection(), metric);
    LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
  }

  return job;
}
 
Example 17
Source File: ConfigurableHDFSFileSource.java    From components with Apache License 2.0
private FileInputFormat<?, ?> createFormat(Job job) throws IOException, IllegalAccessException, InstantiationException {
    Path path = new Path(filepattern);
    FileInputFormat.addInputPath(job, path);
    return formatClass.newInstance();
}
 
Example 18
Source File: Missed.java    From MapReduce-Demo with MIT License
public static void main(String[] args) throws Exception {		
	//1. HDFS connection settings
	String namenode_ip = "192.168.17.10";
	String hdfs = "hdfs://" + namenode_ip + ":9000";			
	Configuration conf = new Configuration();
	conf.set("fs.defaultFS", hdfs);
	conf.set("mapreduce.app-submission.cross-platform", "true");

	//2. MapReduce job configuration
	String jobName = "Missed";						// job name
	Job job = Job.getInstance(conf, jobName);
	job.setJarByClass(Missed.class);				// job class
	job.setJar("export\\Missed.jar");				// local jar to submit
	job.setMapperClass(MissedMapper.class);			// Mapper class
	job.setMapOutputKeyClass(Text.class);			// Mapper output key type
	job.setMapOutputValueClass(NullWritable.class);	// Mapper output value type
	job.setReducerClass(MissedReducer.class);		// Reducer class
	// Define the named multi-file output: name, output format, key type, value type
	MultipleOutputs.addNamedOutput(job, "missed", TextOutputFormat.class, Text.class, NullWritable.class);
	
	//3. Job input and output paths
	String dataDir = "/expr/weblog/data";			// input data directory
	String outputDir = "/expr/weblog/output2";		// output directory
	Path inPath = new Path(hdfs + dataDir);
	Path outPath = new Path(hdfs + outputDir);
	FileInputFormat.addInputPath(job, inPath);
	FileOutputFormat.setOutputPath(job, outPath);
	FileSystem fs = FileSystem.get(conf);
	if(fs.exists(outPath)) {
		fs.delete(outPath, true);
	}
	
	//4. Run the job
	System.out.println("Job: " + jobName + " is running...");
	if(job.waitForCompletion(true)) {
		System.out.println("success!");
		System.exit(0);
	} else {
		System.out.println("failed!");
		System.exit(1);
	}
}
 
Example 19
Source File: BlurOutputFormatTest.java    From incubator-retired-blur with Apache License 2.0
public void testBlurOutputFormatCleanupDuringJobKillTest() throws IOException, InterruptedException,
    ClassNotFoundException {
  Path input = getInDir();
  Path output = getOutDir();
  _fileSystem.delete(input, true);
  _fileSystem.delete(output, true);
  // 1500 * 50 = 75,000
  writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1");
  // 100 * 5000 = 500,000
  writeRecordsFile(new Path(input, "part2"), 1, 5000, 2000, 100, "cf1");

  Job job = Job.getInstance(_conf, "blur index");
  job.setJarByClass(BlurOutputFormatTest.class);
  job.setMapperClass(CsvBlurMapper.class);
  job.setInputFormatClass(TextInputFormat.class);

  FileInputFormat.addInputPath(job, input);
  CsvBlurMapper.addColumns(job, "cf1", "col");

  Path tablePath = new Path(new Path(_root, "table"), "test");

  TableDescriptor tableDescriptor = new TableDescriptor();
  tableDescriptor.setShardCount(2);
  tableDescriptor.setTableUri(tablePath.toString());
  tableDescriptor.setName("test");

  createShardDirectories(getOutDir(), 2);

  BlurOutputFormat.setupJob(job, tableDescriptor);
  BlurOutputFormat.setOutputPath(job, output);
  BlurOutputFormat.setIndexLocally(job, false);

  job.submit();
  boolean killCalled = false;
  while (!job.isComplete()) {
    Thread.sleep(1000);
    System.out.printf("Killed [" + killCalled + "] Map [%f] Reduce [%f]%n", job.mapProgress() * 100,
        job.reduceProgress() * 100);
    if (job.reduceProgress() > 0.7 && !killCalled) {
      job.killJob();
      killCalled = true;
    }
  }

  assertFalse(job.isSuccessful());

  for (int i = 0; i < tableDescriptor.getShardCount(); i++) {
    Path path = new Path(output, ShardUtil.getShardName(i));
    FileSystem fileSystem = path.getFileSystem(job.getConfiguration());
    FileStatus[] listStatus = fileSystem.listStatus(path);
    assertEquals(toString(listStatus), 0, listStatus.length);
  }
}
 
Example 20
Source File: IndexerJobDriver.java    From incubator-retired-blur with Apache License 2.0
private boolean runAutomatic(String uuid, TableDescriptor descriptor, List<Path> inprogressPathList, String table,
    Path fileCache, Path outputPath, int reducerMultipler, Path tmpPath, TableStats tableStats, String snapshot)
    throws ClassNotFoundException, IOException, InterruptedException {
  PartitionedInputResult result = buildPartitionedInputData(uuid, tmpPath, descriptor, inprogressPathList, snapshot,
      fileCache);

  Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");

  InputSplitPruneUtil.setBlurLookupRowIdFromNewDataCounts(job, table, result._rowIdsFromNewData);
  InputSplitPruneUtil.setBlurLookupRowIdUpdateFromNewDataCounts(job, table, result._rowIdsToUpdateFromNewData);
  InputSplitPruneUtil.setBlurLookupRowIdFromIndexCounts(job, table, result._rowIdsFromIndex);
  InputSplitPruneUtil.setTable(job, table);

  BlurInputFormat.setLocalCachePath(job, fileCache);

  // Existing data - This adds the copy data files first open and stream
  // through all documents.
  {
    Path tablePath = new Path(descriptor.getTableUri());
    BlurInputFormat.addTable(job, descriptor, MRUPDATE_SNAPSHOT);
    MultipleInputs.addInputPath(job, tablePath, PrunedBlurInputFormat.class, ExistingDataMapper.class);
  }

  // Existing data - This adds the row id lookup
  {
    ExistingDataIndexLookupMapper.setSnapshot(job, MRUPDATE_SNAPSHOT);
    FileInputFormat.addInputPath(job, result._partitionedInputData);
    MultipleInputs.addInputPath(job, result._partitionedInputData, PrunedSequenceFileInputFormat.class,
        ExistingDataIndexLookupMapper.class);
  }

  // New Data
  for (Path p : inprogressPathList) {
    FileInputFormat.addInputPath(job, p);
    MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, NewDataMapper.class);
  }

  BlurOutputFormat.setOutputPath(job, outputPath);
  BlurOutputFormat.setupJob(job, descriptor);

  job.setReducerClass(UpdateReducer.class);
  job.setMapOutputKeyClass(IndexKey.class);
  job.setMapOutputValueClass(IndexValue.class);
  job.setPartitionerClass(IndexKeyPartitioner.class);
  job.setGroupingComparatorClass(IndexKeyWritableComparator.class);

  BlurOutputFormat.setReducerMultiplier(job, reducerMultipler);

  boolean success = job.waitForCompletion(true);
  Counters counters = job.getCounters();
  LOG.info("Counters [" + counters + "]");
  return success;
}