org.apache.hadoop.mapreduce.lib.input.TextInputFormat Java Examples

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.TextInputFormat, the MapReduce InputFormat that splits plain-text files into lines and presents each line to the mapper as a (byte offset, line text) key/value pair. Each example is drawn from the open-source project named in its header.
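Most of the examples below repeat the same wiring pattern, so it helps to see that pattern in isolation first. Here is a minimal, self-contained sketch (the class name TextInputFormatSketch is invented for illustration; no mapper or reducer is set, so Hadoop's built-in identity classes apply and the job simply copies each input line to the output, keyed by its byte offset):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class TextInputFormatSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "TextInputFormat-sketch");
    job.setJarByClass(TextInputFormatSketch.class);

    // TextInputFormat hands each line to the mapper as
    // (LongWritable byte offset, Text line contents).
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // With the identity Mapper and Reducer, the job's output key/value
    // classes must match the pair that TextInputFormat produces.
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

Run it with an input directory and a not-yet-existing output directory as the two arguments. The examples that follow vary this skeleton with real mappers, reducers, combiners, and alternative output formats.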
Example #1
Source File: TestMapReduceLazyOutput.java    From hadoop with Apache License 2.0
private static void runTestLazyOutput(Configuration conf, Path output,
    int numReducers, boolean createLazily) 
throws Exception {
  Job job = Job.getInstance(conf, "Test-Lazy-Output");

  FileInputFormat.setInputPaths(job, INPUT);
  FileOutputFormat.setOutputPath(job, output);

  job.setJarByClass(TestMapReduceLazyOutput.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(numReducers);

  job.setMapperClass(TestMapper.class);
  job.setReducerClass(TestReducer.class);

  if (createLazily) {
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  } else {
    job.setOutputFormatClass(TextOutputFormat.class);
  }
  assertTrue(job.waitForCompletion(true));
}
 
Example #2
Source File: MRWordCount21.java    From hadoop-book with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    System.out.println("Running MR: MRWordCount21");
    Job job = new Job(getConf());
    job.setJarByClass(MRWordCount21.class);
    job.setJobName("MRWordCount21");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(Map21.class);
    job.setCombinerClass(Reduce21.class);
    job.setReducerClass(Reduce21.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    System.out.println("Input path: " + args[0]);
    System.out.println("Output path: " + args[1]);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
 
Example #3
Source File: BigDiffHadoop.java    From secure-data-service with Apache License 2.0
public void execute(String inputPath1, String inputPath2, String outputPath) throws Exception {
    Configuration conf = new Configuration();

    Job job = new Job(conf, "bigdiff");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(inputPath1));
    FileInputFormat.addInputPath(job, new Path(inputPath2));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);
}
 
Example #4
Source File: P2Q2.java    From IntroToHadoopAndMR__Udacity_Course with Apache License 2.0
public final static void main(final String[] args) throws Exception {
	final Configuration conf = new Configuration();

	final Job job = new Job(conf, "P2Q2");
	job.setJarByClass(P2Q2.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);

	job.setMapperClass(P2Q2Map.class);
	job.setCombinerClass(P2Q2Reduce.class);
	job.setReducerClass(P2Q2Reduce.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.addInputPath(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));

	job.waitForCompletion(true);
}
 
Example #5
Source File: PageRankDriver.java    From flink-perf with Apache License 2.0
public static void assignInitialRanks (Configuration conf, FileSystem fs, String adjacencyPath, String initialPath, int numVertices) throws Exception {
	Path seqFile = new Path (initialPath);
	if (fs.exists(seqFile)) {
		fs.delete(seqFile, true);
	}
	Job job = Job.getInstance(conf);
	job.setJarByClass(InitialRankAssigner.class);
	job.setMapperClass(InitialRankAssigner.class);
	job.setReducerClass(Reducer.class);
	job.setNumReduceTasks(0);
	job.setMapOutputKeyClass(LongWritable.class);
	job.setMapOutputValueClass(Message.class);
	job.setOutputKeyClass(LongWritable.class);
	job.setOutputValueClass(Message.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);
	job.setInputFormatClass(TextInputFormat.class);
	FileInputFormat.addInputPath(job, new Path(adjacencyPath));
	FileOutputFormat.setOutputPath(job, seqFile);
	job.waitForCompletion(true);
}
 
Example #6
Source File: P2Q3.java    From IntroToHadoopAndMR__Udacity_Course with Apache License 2.0
public final static void main(final String[] args) throws Exception {
	final Configuration conf = new Configuration();

	final Job job = new Job(conf, "P2Q3");
	job.setJarByClass(P2Q3.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);

	job.setMapperClass(P2Q3Map.class);
	job.setCombinerClass(P2Q3Reduce.class);
	job.setReducerClass(P2Q3Reduce.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.addInputPath(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));

	job.waitForCompletion(true);
}
 
Example #7
Source File: P1Q2.java    From IntroToHadoopAndMR__Udacity_Course with Apache License 2.0
public final static void main(final String[] args) throws Exception {
	final Configuration conf = new Configuration();

	final Job job = new Job(conf, "P1Q2");
	job.setJarByClass(P1Q2.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(DoubleWritable.class);

	job.setMapperClass(P1Q2Map.class);
	job.setCombinerClass(P1Q2Reduce.class);
	job.setReducerClass(P1Q2Reduce.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.addInputPath(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));

	job.waitForCompletion(true);
}
 
Example #8
Source File: UserNamePermission.java    From hadoop with Apache License 2.0
public static void main(String [] args) throws Exception
{
  Path outDir = new Path("output");
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf, "user name check");

  job.setJarByClass(UserNamePermission.class);
  job.setMapperClass(UserNamePermission.UserNameMapper.class);
  job.setCombinerClass(UserNamePermission.UserNameReducer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setReducerClass(UserNamePermission.UserNameReducer.class);
  job.setNumReduceTasks(1);
    
  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(job, new Path("input"));
  FileOutputFormat.setOutputPath(job, outDir);
    
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example #9
Source File: FieldSelectionMapper.java    From hadoop with Apache License 2.0
public void setup(Context context) 
    throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  this.fieldSeparator = 
    conf.get(FieldSelectionHelper.DATA_FIELD_SEPERATOR, "\t");
  this.mapOutputKeyValueSpec = 
    conf.get(FieldSelectionHelper.MAP_OUTPUT_KEY_VALUE_SPEC, "0-:");
  try {
    this.ignoreInputKey = TextInputFormat.class.getCanonicalName().equals(
      context.getInputFormatClass().getCanonicalName());
  } catch (ClassNotFoundException e) {
    throw new IOException("Input format class not found", e);
  }
  allMapValueFieldsFrom = FieldSelectionHelper.parseOutputKeyValueSpec(
    mapOutputKeyValueSpec, mapOutputKeyFieldList, mapOutputValueFieldList);
  LOG.info(FieldSelectionHelper.specToString(fieldSeparator,
    mapOutputKeyValueSpec, allMapValueFieldsFrom, mapOutputKeyFieldList,
    mapOutputValueFieldList) + "\nignoreInputKey:" + ignoreInputKey);
}
 
Example #10
Source File: TestSpecificInputOutputFormat.java    From parquet-mr with Apache License 2.0
@Before
public void createParquetFile() throws Exception {
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    final Job job = new Job(conf, "write");

    // input not really used
    TextInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TestSpecificInputOutputFormat.MyMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    AvroParquetOutputFormat.setOutputPath(job, parquetPath);
    AvroParquetOutputFormat.setSchema(job, Car.SCHEMA$);

    waitForJob(job);
  }
}
 
Example #11
Source File: TestInputOutputFormat.java    From parquet-mr with Apache License 2.0
private void write(final Configuration conf, final Path inputPath,
    final Path parquetPath, Class<? extends Mapper> mapperClass, Class<? extends TBase<?, ?>> outputClass) throws IOException, Exception {
  final Job job = new Job(conf, "write");

  // input not really used
  TextInputFormat.addInputPath(job, inputPath);
  job.setInputFormatClass(TextInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setNumReduceTasks(0);

  job.setOutputFormatClass(ParquetThriftOutputFormat.class);
  ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.GZIP);
  ParquetThriftOutputFormat.setOutputPath(job, parquetPath);
  ParquetThriftOutputFormat.setThriftClass(job, outputClass);

  waitForJob(job);
}
 
Example #12
Source File: TestFileSystemInput.java    From envelope with Apache License 2.0
@Test
public void readInputFormat() throws Exception {
  Map<String, Object> paramMap = new HashMap<>();
  paramMap.put(FileSystemInput.FORMAT_CONFIG, "input-format");
  paramMap.put(FileSystemInput.PATH_CONFIG, FileSystemInput.class.getResource(CSV_DATA).getPath());
  paramMap.put(FileSystemInput.INPUT_FORMAT_TYPE_CONFIG, TextInputFormat.class.getCanonicalName());
  paramMap.put("translator" + "." + ComponentFactory.TYPE_CONFIG_NAME,
      DummyInputFormatTranslator.class.getCanonicalName());
  config = ConfigFactory.parseMap(paramMap);

  FileSystemInput formatInput = new FileSystemInput();
  assertNoValidationFailures(formatInput, config);
  formatInput.configure(config);

  Dataset<Row> results = formatInput.read();

  assertEquals("Invalid number of rows", 4, results.count());
  assertEquals("Invalid first row result", 0L, results.first().getLong(0));
  assertEquals("Invalid first row result", "One,Two,Three,Four", results.first().getString(1));
}
 
Example #13
Source File: P2Q1.java    From IntroToHadoopAndMR__Udacity_Course with Apache License 2.0
public final static void main(final String[] args) throws Exception {
	final Configuration conf = new Configuration();

	final Job job = new Job(conf, "P2Q1");
	job.setJarByClass(P2Q1.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);

	job.setMapperClass(P2Q1Map.class);
	job.setCombinerClass(P2Q1Reduce.class);
	job.setReducerClass(P2Q1Reduce.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.addInputPath(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));

	job.waitForCompletion(true);
}
 
Example #14
Source File: FailJob.java    From big-c with Apache License 2.0
public Job createJob(boolean failMappers, boolean failReducers, Path inputFile) 
    throws IOException {
  Configuration conf = getConf();
  conf.setBoolean(FAIL_MAP, failMappers);
  conf.setBoolean(FAIL_REDUCE, failReducers);
  Job job = Job.getInstance(conf, "fail");
  job.setJarByClass(FailJob.class);
  job.setMapperClass(FailMapper.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(FailReducer.class);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setSpeculativeExecution(false);
  job.setJobName("Fail job");
  FileInputFormat.addInputPath(job, inputFile);
  return job;
}
 
Example #15
Source File: TestMapReduceLazyOutput.java    From big-c with Apache License 2.0
private static void runTestLazyOutput(Configuration conf, Path output,
    int numReducers, boolean createLazily) 
throws Exception {
  Job job = Job.getInstance(conf, "Test-Lazy-Output");

  FileInputFormat.setInputPaths(job, INPUT);
  FileOutputFormat.setOutputPath(job, output);

  job.setJarByClass(TestMapReduceLazyOutput.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(numReducers);

  job.setMapperClass(TestMapper.class);
  job.setReducerClass(TestReducer.class);

  if (createLazily) {
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  } else {
    job.setOutputFormatClass(TextOutputFormat.class);
  }
  assertTrue(job.waitForCompletion(true));
}
 
Example #16
Source File: P1Q3.java    From IntroToHadoopAndMR__Udacity_Course with Apache License 2.0
public final static void main(final String[] args) throws Exception {
	final Configuration conf = new Configuration();

	final Job job = new Job(conf, "P1Q3");
	job.setJarByClass(P1Q3.class);

	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(DoubleWritable.class);

	job.setOutputKeyClass(IntWritable.class);
	job.setOutputValueClass(DoubleWritable.class);

	job.setMapperClass(P1Q3Map.class);
	//job.setCombinerClass(P1Q3Reduce.class);
	job.setReducerClass(P1Q3Reduce.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.addInputPath(job, new Path(args[0]));
	FileOutputFormat.setOutputPath(job, new Path(args[1]));

	job.waitForCompletion(true);
}
 
Example #17
Source File: ReplicatedUserJoin.java    From hadoop-map-reduce-patterns with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
	Configuration conf = new Configuration();
	GenericOptionsParser parser = new GenericOptionsParser(conf, args);
	String[] otherArgs = parser.getRemainingArgs();
	if (otherArgs.length != 4) {
		printUsage();
	}
	Job job = new Job(conf, "ReduceSideJoin");
	job.setJarByClass(ReplicatedUserJoin.class);

	// Use MultipleInputs to assign each input path its own mapper,
	// keeping the parsing of the two data sets logically separate.
	// The first two elements of the args array are the two inputs.
	MultipleInputs.addInputPath(job, new Path(args[0]),
			TextInputFormat.class, UserJoinMapper.class);
	MultipleInputs.addInputPath(job, new Path(args[1]),
			TextInputFormat.class, CommentJoinMapper.class);
	job.getConfiguration().set("join.type", args[2]);

	job.setReducerClass(UserJoinReducer.class);

	job.setOutputFormatClass(TextOutputFormat.class);
	TextOutputFormat.setOutputPath(job, new Path(args[3]));

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(Text.class);

	return job.waitForCompletion(true) ? 0 : 2;
}
 
Example #18
Source File: SamplerJob.java    From hiped2 with Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(SamplerJob.class);

  ReservoirSamplerInputFormat.setInputFormat(job,
      TextInputFormat.class);

  ReservoirSamplerInputFormat.setNumSamples(job, 10);
  ReservoirSamplerInputFormat.setMaxRecordsToRead(job, 10000);
  ReservoirSamplerInputFormat.setUseSamplesNumberPerInputSplit(job, true);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  outputPath.getFileSystem(conf).delete(outputPath, true);

  if (job.waitForCompletion(true)) {
    return 0;
  }
  return 1;
}
 
Example #19
Source File: ClusterHdfsSource.java    From datacollector with Apache License 2.0
@VisibleForTesting
List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  int previewCount = previewBuffer.size();
  TextInputFormat textInputFormat = new TextInputFormat();
  long fileLength = fileStatus.getLen();
  List<Map.Entry> batch = new ArrayList<>();
  Path filePath = fileStatus.getPath();
  // MR allows file length to be 0 for text data (not for avro)
  if (fileLength == 0) {
    LOG.info("File length is 0 for {}", filePath);
    return batch;
  }
  // Hadoop does unsafe casting from long to int, so split length should not be greater than int
  // max value
  long splitLength = (fileLength < Integer.MAX_VALUE) ? fileLength: Integer.MAX_VALUE;
  InputSplit fileSplit = new FileSplit(filePath, 0, splitLength, null);
  TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(hadoopConf,
      TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));

  try (RecordReader<LongWritable, Text> recordReader = textInputFormat.createRecordReader(fileSplit, taskAttemptContext)) {
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean hasNext = recordReader.nextKeyValue();

    while (hasNext && batch.size() < batchSize && previewCount < batchSize) {
      batch.add(new Pair(filePath.toUri().getPath() + "::" + recordReader.getCurrentKey(),
          String.valueOf(recordReader.getCurrentValue())));
      hasNext = recordReader.nextKeyValue(); // not like iterator.hasNext, actually advances
      previewCount++;
    }
  }
  return batch;
}
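Unlike the driver-style examples, this snippet instantiates TextInputFormat directly and pulls records through its RecordReader by hand, which lets the caller preview the first lines of a file without submitting a MapReduce job.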
 
Example #20
Source File: JMatrixMultiplicationStep1.java    From RecommendationEngine with MIT License
public static void run() throws IOException, ClassNotFoundException,
        InterruptedException {
    String inputPath1 = ItemBasedCFDriver.path.get("step7InputPath1");
    String inputPath2 = ItemBasedCFDriver.path.get("step7InputPath2");
    String outputPath = ItemBasedCFDriver.path.get("step7OutputPath");

    Configuration conf = new Configuration();
    conf.set("mapred.textoutputformat.separator", ",");

    Job job = Job.getInstance(conf);

    HDFS hdfs = new HDFS(conf);
    hdfs.rmr(outputPath);

    job.setMapperClass(Step1_Mapper.class);
    job.setJarByClass(JMatrixMultiplicationStep1.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputPath1), new Path(
            inputPath2));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);
}
 
Example #21
Source File: CalculateSimilarityStep2.java    From RecommendationEngine with MIT License
public static void run() throws IOException, ClassNotFoundException,
		InterruptedException {

	String inputPath = ItemBasedCFDriver.path.get("step2InputPath");
	String outputPath = ItemBasedCFDriver.path.get("step2OutputPath");

	Configuration conf = new Configuration();
	conf.set("mapred.textoutputformat.separator", ":");

	Job job = Job.getInstance(conf);

	HDFS hdfs = new HDFS(conf);
	hdfs.rmr(outputPath);

	job.setMapperClass(Step2_Mapper.class);
	job.setReducerClass(Step2_Reducer.class);
	job.setCombinerClass(Step2_Reducer.class);
	job.setNumReduceTasks(ItemBasedCFDriver.ReducerNumber);

	job.setJarByClass(CalculateSimilarityStep2.class);

	job.setMapOutputKeyClass(IntWritable.class);
	job.setMapOutputValueClass(Text.class);
	job.setOutputKeyClass(IntWritable.class);
	job.setOutputValueClass(Text.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.setInputPaths(job, new Path(inputPath));
	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	job.waitForCompletion(true);
}
 
Example #22
Source File: LouvainRunner.java    From distributed-graph-analytics with Apache License 2.0
private int runMapreduceJob(String inputPath, String outputPath, DGAConfiguration conf) throws Exception {
    Configuration mrConf = new Configuration();
    for (Map.Entry<String, String> entry : conf.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    // Build the job from mrConf so the DGA settings copied above take effect.
    Job job = Job.getInstance(mrConf);
    job.setJarByClass(LouvainRunner.class);
    Path in = new Path(inputPath);
    Path out = new Path(outputPath);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("CommunityCompression");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);

    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);

    return job.waitForCompletion(false) ? 0 : 1;
}
 
Example #23
Source File: JobContextImpl.java    From tez with Apache License 2.0
/**
 * Get the {@link InputFormat} class for the job.
 * 
 * @return the {@link InputFormat} class for the job.
 */
@SuppressWarnings("unchecked")
public Class<? extends InputFormat<?,?>> getInputFormatClass() 
   throws ClassNotFoundException {
  return (Class<? extends InputFormat<?,?>>) 
    conf.getClass(INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
}
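This accessor also explains why a driver may omit setInputFormatClass entirely: when INPUT_FORMAT_CLASS_ATTR is unset, the lookup falls back to TextInputFormat, making it the default input format for a job.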
 
Example #24
Source File: CommunityCompression.java    From distributed-graph-analytics with Apache License 2.0
public int run(String[] args) throws Exception {
    Configuration mrConf = this.getConf();
    for (java.util.Map.Entry<String, String> entry : dgaConfiguration.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(mrConf);
    job.setJarByClass(CommunityCompression.class);
    Path in = new Path(inputPath);
    Path out = new Path(outputPath);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("CommunityCompression");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);

    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);

    return job.waitForCompletion(false) ? 0 : 1;
}
 
Example #25
Source File: WordCount.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {
	if (args.length < 2) {
		System.err.println("Usage: WordCount <input path> <result path>");
		return;
	}

	final String inputPath = args[0];
	final String outputPath = args[1];

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// Set up the Hadoop Input Format
	Job job = Job.getInstance();
	HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
	TextInputFormat.addInputPath(job, new Path(inputPath));

	// Create a Flink job with it
	DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

	// Tokenize the line and convert from Writable "Text" to String for better handling
	DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

	// Sum up the words
	DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

	// Convert String back to Writable "Text" for use with Hadoop Output Format
	DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

	// Set up Hadoop Output Format
	HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
	hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
	hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
	TextOutputFormat.setOutputPath(job, new Path(outputPath));

	// Output & Execute
	hadoopResult.output(hadoopOutputFormat);
	env.execute("Word Count");
}
 
Example #26
Source File: TestMRHelpers.java    From incubator-tez with Apache License 2.0
private InputSplitInfo generateNewSplits(Path inputSplitsDir)
    throws Exception {
  JobConf jobConf = new JobConf();
  jobConf.setUseNewMapper(true);
  jobConf.setClass(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class,
      InputFormat.class);
  jobConf.set(TextInputFormat.INPUT_DIR, testFilePath.toString());

  return MRHelpers.generateInputSplits(jobConf, inputSplitsDir);
}
 
Example #27
Source File: WordCount.java    From accumulo-examples with Apache License 2.0
public static void main(String[] args) throws Exception {
  Opts opts = new Opts();
  opts.parseArgs(WordCount.class.getName(), args);

  // Create Accumulo table and attach Summing iterator
  try (AccumuloClient client = opts.createAccumuloClient()) {
    client.tableOperations().create(opts.tableName);
    IteratorSetting is = new IteratorSetting(10, SummingCombiner.class);
    SummingCombiner.setColumns(is,
        Collections.singletonList(new IteratorSetting.Column("count")));
    SummingCombiner.setEncodingType(is, SummingCombiner.Type.STRING);
    client.tableOperations().attachIterator(opts.tableName, is);
  } catch (TableExistsException e) {
    // ignore
  }

  // Create M/R job
  Job job = Job.getInstance(opts.getHadoopConfig());
  job.setJobName(WordCount.class.getName());
  job.setJarByClass(WordCount.class);
  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.setInputPaths(job, new Path(opts.inputDirectory));

  job.setMapperClass(MapClass.class);
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(AccumuloOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Mutation.class);

  if (opts.hdfsPath != null) {
    AccumuloOutputFormat.configure().clientPropertiesPath(opts.hdfsPath)
        .defaultTable(opts.tableName).store(job);
  } else {
    AccumuloOutputFormat.configure().clientProperties(opts.getClientProperties())
        .defaultTable(opts.tableName).store(job);
  }
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example #28
Source File: Phase3Step2DistinctDataJob.java    From dkpro-c4corpus with Apache License 2.0
@Override
public int run(String[] args)
        throws Exception
{

    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step2DistinctDataJob.class);
    job.setJobName(Phase3Step2DistinctDataJob.class.getName());

    //mapper
    job.setMapperClass(RemoveRedundantDataMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);

    //reducer
    job.setReducerClass(RemoveRedundantDataReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    //paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    //i/o paths
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example #29
Source File: CalculateSimilarityStep5.java    From RecommendationEngine with MIT License
public static void run() throws IOException, ClassNotFoundException,
		InterruptedException {
	String inputPath = ItemBasedCFDriver.path.get("step5InputPath");
	String outputPath = ItemBasedCFDriver.path.get("step5OutputPath");

	Configuration conf = new Configuration();
	conf.set("mapred.textoutputformat.separator", ":");

	Job job = Job.getInstance(conf);
	HDFS hdfs = new HDFS(conf);
	hdfs.rmr(outputPath);

	job.setMapperClass(Step5_Mapper.class);

	job.setJarByClass(CalculateSimilarityStep5.class);

	job.setMapOutputKeyClass(IntWritable.class);
	job.setMapOutputValueClass(IntWritable.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.setInputPaths(job, new Path(inputPath));
	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	job.waitForCompletion(true);
}
 
Example #30
Source File: TestMRInputHelpers.java    From tez with Apache License 2.0
private DataSourceDescriptor generateDataSourceDescriptorMapReduce(Path inputSplitsDir)
    throws Exception {
  JobConf jobConf = new JobConf(dfsCluster.getFileSystem().getConf());
  jobConf.setUseNewMapper(true);
  jobConf.setClass(org.apache.hadoop.mapreduce.MRJobConfig.INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class,
      InputFormat.class);
  jobConf.set(TextInputFormat.INPUT_DIR, testFilePath.toString());

  return MRInputHelpers.configureMRInputWithLegacySplitGeneration(jobConf, inputSplitsDir, true);
}