org.apache.hadoop.mapred.TextOutputFormat Java Examples

The following examples show how to use org.apache.hadoop.mapred.TextOutputFormat. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PersonVersion.java    From blog with MIT License 6 votes vote down vote up
private static void runJobPv(String inputDir, String outputDir, String jobName, Class<? extends Mapper> mapClass,
                             Class<? extends Reducer> reduceClass) throws Exception {
    JobConf conf = new JobConf(PersonVersion.class);
    conf.setJobName(jobName);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(mapClass);
    conf.setCombinerClass(reduceClass);
    conf.setReducerClass(reduceClass);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputDir);
    FileOutputFormat.setOutputPath(conf, new Path(outputDir));

    JobClient.runJob(conf);
}
 
Example #2
Source File: HdfsHelper.java    From DataLink with Apache License 2.0 6 votes vote down vote up
TextWriterProxy(Configuration config, String fileName) throws IOException{
	fieldDelimiter = config.getChar(Key.FIELD_DELIMITER);
       columns = config.getListConfiguration(Key.COLUMN);
       
       String compress = config.getString(Key.COMPRESS,null);
       SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmm");
       String attempt = "attempt_"+dateFormat.format(new Date())+"_0001_m_000000_0";
       Path outputPath = new Path(fileName);
       //todo 需要进一步确定TASK_ATTEMPT_ID
       conf.set(JobContext.TASK_ATTEMPT_ID, attempt);
       FileOutputFormat outFormat = new TextOutputFormat();
       outFormat.setOutputPath(conf, outputPath);
       outFormat.setWorkOutputPath(conf, outputPath);
       if(null != compress) {
           Class<? extends CompressionCodec> codecClass = getCompressCodec(compress);
           if (null != codecClass) {
               outFormat.setOutputCompressorClass(conf, codecClass);
           }
       }
       
       writer = outFormat.getRecordWriter(fileSystem, conf, outputPath.toString(), Reporter.NULL);
}
 
Example #3
Source File: SliveTest.java    From hadoop with Apache License 2.0 6 votes vote down vote up
/**
 * Sets up a job conf for the given job using the given config object. Ensures
 * that the correct input format is set, the mapper and and reducer class and
 * the input and output keys and value classes along with any other job
 * configuration.
 * 
 * @param config
 * @return JobConf representing the job to be ran
 * @throws IOException
 */
private JobConf getJob(ConfigExtractor config) throws IOException {
  JobConf job = new JobConf(config.getConfig(), SliveTest.class);
  job.setInputFormat(DummyInputFormat.class);
  FileOutputFormat.setOutputPath(job, config.getOutputPath());
  job.setMapperClass(SliveMapper.class);
  job.setPartitionerClass(SlivePartitioner.class);
  job.setReducerClass(SliveReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setCompressOutput(job, false);
  job.setNumReduceTasks(config.getReducerAmount());
  job.setNumMapTasks(config.getMapAmount());
  return job;
}
 
Example #4
Source File: WordCountInput.java    From aerospike-hadoop with Apache License 2.0 6 votes vote down vote up
public int run(final String[] args) throws Exception {

        log.info("run starting");

        final Configuration conf = getConf();

        JobConf job = new JobConf(conf, WordCountInput.class);
        job.setJobName("AerospikeWordCountInput");

        job.setInputFormat(AerospikeInputFormat.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setOutputFormat(TextOutputFormat.class);

        FileOutputFormat.setOutputPath(job, new Path(args[0]));

        JobClient.runJob(job);

        log.info("finished");
        return 0;
    }
 
Example #5
Source File: CartesianCommentComparison.java    From hadoop-map-reduce-patterns with Apache License 2.0 6 votes vote down vote up
@Override
public int run(String[] args) throws Exception {
	if (args.length != 2) {
		System.err.println("Usage: CartesianCommentComparison <in> <out>");
		ToolRunner.printGenericCommandUsage(System.err);
		System.exit(2);
	}

	// Configure the join type
	JobConf conf = new JobConf("Cartesian Product");
	conf.setJarByClass(CartesianCommentComparison.class);
	conf.setMapperClass(CartesianMapper.class);
	conf.setNumReduceTasks(0);
	conf.setInputFormat(CartesianInputFormat.class);
	// Configure the input format
	CartesianInputFormat.setLeftInputInfo(conf, TextInputFormat.class, args[0]);
	CartesianInputFormat.setRightInputInfo(conf, TextInputFormat.class, args[0]);
	TextOutputFormat.setOutputPath(conf, new Path(args[1]));
	conf.setOutputKeyClass(Text.class);
	conf.setOutputValueClass(Text.class);
	RunningJob job = JobClient.runJob(conf);
	while (!job.isComplete()) {
		Thread.sleep(1000);
	}
	return job.isSuccessful() ? 0 : 1;
}
 
Example #6
Source File: SliveTest.java    From big-c with Apache License 2.0 6 votes vote down vote up
/**
 * Sets up a job conf for the given job using the given config object. Ensures
 * that the correct input format is set, the mapper and and reducer class and
 * the input and output keys and value classes along with any other job
 * configuration.
 * 
 * @param config
 * @return JobConf representing the job to be ran
 * @throws IOException
 */
private JobConf getJob(ConfigExtractor config) throws IOException {
  JobConf job = new JobConf(config.getConfig(), SliveTest.class);
  job.setInputFormat(DummyInputFormat.class);
  FileOutputFormat.setOutputPath(job, config.getOutputPath());
  job.setMapperClass(SliveMapper.class);
  job.setPartitionerClass(SlivePartitioner.class);
  job.setReducerClass(SliveReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setCompressOutput(job, false);
  job.setNumReduceTasks(config.getReducerAmount());
  job.setNumMapTasks(config.getMapAmount());
  return job;
}
 
Example #7
Source File: WordCount.java    From attic-apex-malhar with Apache License 2.0 6 votes vote down vote up
public void run(String[] args) throws Exception
{

  JobConf conf = new JobConf(this.getClass());
  conf.setJobName("wordcount");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(Map.class);
  conf.setCombinerClass(Reduce.class);
  conf.setReducerClass(Reduce.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
 
Example #8
Source File: HadoopMapredCompatWordCount.java    From flink with Apache License 2.0 5 votes vote down vote up
public static void main(String[] args) throws Exception {
	if (args.length < 2) {
		System.err.println("Usage: WordCount <input path> <result path>");
		return;
	}

	final String inputPath = args[0];
	final String outputPath = args[1];

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// Set up the Hadoop Input Format
	HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
	TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

	// Create a Flink job with it
	DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

	DataSet<Tuple2<Text, LongWritable>> words =
			text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
				.groupBy(0).reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

	// Set up Hadoop Output Format
	HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
			new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
	hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
	TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

	// Output & Execute
	words.output(hadoopOutputFormat).setParallelism(1);
	env.execute("Hadoop Compat WordCount");
}
 
Example #9
Source File: CrawlDbReader.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status, Integer retry) throws IOException {
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb dump: starting");
    LOG.info("CrawlDb db: " + crawlDb);
  }

  Path outFolder = new Path(output);

  JobConf job = new NutchJob(config);
  job.setJobName("dump " + crawlDb);

  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, outFolder);

  if (format.equals("csv")) {
    job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
  }
  else if (format.equals("crawldb")) {
    job.setOutputFormat(MapFileOutputFormat.class);
  } else {
    job.setOutputFormat(TextOutputFormat.class);
  }

  if (status != null) job.set("status", status);
  if (regex != null) job.set("regex", regex);
  if (retry != null) job.setInt("retry", retry);
  
  job.setMapperClass(CrawlDbDumpMapper.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);

  JobClient.runJob(job);
  if (LOG.isInfoEnabled()) { LOG.info("CrawlDb dump: done"); }
}
 
Example #10
Source File: MultipleTextOutputFormat.java    From RDFS with Apache License 2.0 5 votes vote down vote up
@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job,
    String name, Progressable arg3) throws IOException {
  if (theTextOutputFormat == null) {
    theTextOutputFormat = new TextOutputFormat<K, V>();
  }
  return theTextOutputFormat.getRecordWriter(fs, job, name, arg3);
}
 
Example #11
Source File: TableMapReduceUtil.java    From hbase with Apache License 2.0 5 votes vote down vote up
/**
 * @see org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job)
 */
public static void addDependencyJars(JobConf job) throws IOException {
  org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addHBaseDependencyJars(job);
  org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addDependencyJarsForClasses(
    job,
    job.getMapOutputKeyClass(),
    job.getMapOutputValueClass(),
    job.getOutputKeyClass(),
    job.getOutputValueClass(),
    job.getPartitionerClass(),
    job.getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class),
    job.getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class),
    job.getCombinerClass());
}
 
Example #12
Source File: HadoopMapredCompatWordCount.java    From flink with Apache License 2.0 5 votes vote down vote up
public static void main(String[] args) throws Exception {
	if (args.length < 2) {
		System.err.println("Usage: WordCount <input path> <result path>");
		return;
	}

	final String inputPath = args[0];
	final String outputPath = args[1];

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// Set up the Hadoop Input Format
	HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
	TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

	// Create a Flink job with it
	DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

	DataSet<Tuple2<Text, LongWritable>> words =
			text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
				.groupBy(0).reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

	// Set up Hadoop Output Format
	HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
			new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
	hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
	TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

	// Output & Execute
	words.output(hadoopOutputFormat).setParallelism(1);
	env.execute("Hadoop Compat WordCount");
}
 
Example #13
Source File: HadoopWordCount1.java    From ignite with Apache License 2.0 5 votes vote down vote up
/**
 * Sets task classes with related info if needed into configuration object.
 *
 * @param jobConf Configuration to change.
 * @param setMapper Option to set mapper and input format classes.
 * @param setCombiner Option to set combiner class.
 * @param setReducer Option to set reducer and output format classes.
 */
public static void setTasksClasses(JobConf jobConf, boolean setMapper, boolean setCombiner, boolean setReducer) {
    if (setMapper) {
        jobConf.setMapperClass(HadoopWordCount1Map.class);
        jobConf.setInputFormat(TextInputFormat.class);
    }

    if (setCombiner)
        jobConf.setCombinerClass(HadoopWordCount1Reduce.class);

    if (setReducer) {
        jobConf.setReducerClass(HadoopWordCount1Reduce.class);
        jobConf.setOutputFormat(TextOutputFormat.class);
    }
}
 
Example #14
Source File: CompositeUserJoin.java    From hadoop-map-reduce-patterns with Apache License 2.0 5 votes vote down vote up
@Override
public int run(String[] args) throws Exception {
	if (args.length != 4) {
		printUsage();
	}
	Path userPath = new Path(args[0]);
	Path commentPath = new Path(args[1]);
	Path outputDir = new Path(args[2]);
	String joinType = args[3];
	JobConf conf = new JobConf("CompositeJoin");
	conf.setJarByClass(CompositeUserJoin.class);
	conf.setMapperClass(CompositeMapper.class);
	conf.setNumReduceTasks(0);
	// Set the input format class to a CompositeInputFormat class.
	// The CompositeInputFormat will parse all of our input files and output
	// records to our mapper.
	conf.setInputFormat(CompositeInputFormat.class);
	// The composite input format join expression will set how the records
	// are going to be read in, and in what input format.
	conf.set("mapred.join.expr", CompositeInputFormat.compose(joinType,
			KeyValueTextInputFormat.class, userPath, commentPath));
	TextOutputFormat.setOutputPath(conf, outputDir);
	conf.setOutputKeyClass(Text.class);
	conf.setOutputValueClass(Text.class);
	RunningJob job = JobClient.runJob(conf);
	while (!job.isComplete()) {
		Thread.sleep(1000);
	}
	return job.isSuccessful() ? 0 : 1;
}
 
Example #15
Source File: CrawlDbReader.java    From anthelion with Apache License 2.0 5 votes vote down vote up
public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status) throws IOException {
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb dump: starting");
    LOG.info("CrawlDb db: " + crawlDb);
  }

  Path outFolder = new Path(output);

  JobConf job = new NutchJob(config);
  job.setJobName("dump " + crawlDb);

  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, outFolder);

  if (format.equals("csv")) {
    job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
  }
  else if (format.equals("crawldb")) {
    job.setOutputFormat(MapFileOutputFormat.class);
  } else {
    job.setOutputFormat(TextOutputFormat.class);
  }

  if (status != null) job.set("status", status);
  if (regex != null) job.set("regex", regex);

  job.setMapperClass(CrawlDbDumpMapper.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);

  JobClient.runJob(job);
  if (LOG.isInfoEnabled()) { LOG.info("CrawlDb dump: done"); }
}
 
Example #16
Source File: LogCountsPerHour.java    From attic-apex-malhar with Apache License 2.0 5 votes vote down vote up
public int run(String[] args) throws Exception
{
  // Create a configuration
  Configuration conf = getConf();

  // Create a job from the default configuration that will use the WordCount class
  JobConf job = new JobConf(conf, LogCountsPerHour.class);

  // Define our input path as the first command line argument and our output path as the second
  Path in = new Path(args[0]);
  Path out = new Path(args[1]);

  // Create File Input/Output formats for these paths (in the job)
  FileInputFormat.setInputPaths(job, in);
  FileOutputFormat.setOutputPath(job, out);

  // Configure the job: name, mapper, reducer, and combiner
  job.setJobName("LogAveragePerHour");
  job.setMapperClass(LogMapClass.class);
  job.setReducerClass(LogReduce.class);
  job.setCombinerClass(LogReduce.class);

  // Configure the output
  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(DateWritable.class);
  job.setOutputValueClass(IntWritable.class);

  // Run the job
  JobClient.runJob(job);
  return 0;
}
 
Example #17
Source File: WordCountWithHadoopOutputFormat.java    From stratosphere with Apache License 2.0 5 votes vote down vote up
@Override
public Plan getPlan(String... args) {
	// parse job parameters
	int numSubTasks   = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
	String dataInput = (args.length > 1 ? args[1] : "");
	String output    = (args.length > 2 ? args[2] : "");

	HadoopDataSource<LongWritable, Text> source = new HadoopDataSource<LongWritable, Text>(
			new TextInputFormat(), new JobConf(), "Input Lines");
	TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));


	MapOperator mapper = MapOperator.builder(new TokenizeLine())
			.input(source)
			.name("Tokenize Lines")
			.build();
	ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
			.input(mapper)
			.name("Count Words")
			.build();
	HadoopDataSink<Text, IntWritable> out = new HadoopDataSink<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(),new JobConf(), "Hadoop TextOutputFormat", reducer, Text.class, IntWritable.class);
	TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output));

	Plan plan = new Plan(out, "Hadoop OutputFormat Example");
	plan.setDefaultParallelism(numSubTasks);
	return plan;
}
 
Example #18
Source File: MultipleTextOutputFormat.java    From big-c with Apache License 2.0 5 votes vote down vote up
@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job,
    String name, Progressable arg3) throws IOException {
  if (theTextOutputFormat == null) {
    theTextOutputFormat = new TextOutputFormat<K, V>();
  }
  return theTextOutputFormat.getRecordWriter(fs, job, name, arg3);
}
 
Example #19
Source File: CopyFromS3.java    From emr-sample-apps with Apache License 2.0 5 votes vote down vote up
/**
 * This method constructs the JobConf to be used to run the map reduce job to
 * download the files from S3. This is a potentially expensive method since it
 * makes multiple calls to S3 to get a listing of all the input data. Clients
 * are encouraged to cache the returned JobConf reference and not call this
 * method multiple times unless necessary.
 * 
 * @return the JobConf to be used to run the map reduce job to download the
 *         files from S3.
 */
public JobConf getJobConf() throws IOException, ParseException {
  JobConf conf = new JobConf(CopyFromS3.class);
  conf.setJobName("CopyFromS3");
  conf.setOutputKeyClass(NullWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapperClass(S3CopyMapper.class);
  // We configure a reducer, even though we don't use it right now.
  // The idea is that, in the future we may. 
  conf.setReducerClass(HDFSWriterReducer.class);
  conf.setNumReduceTasks(0);

  FileInputFormat.setInputPaths(conf, new Path(tempFile));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  conf.setOutputFormat(TextOutputFormat.class);
  conf.setCompressMapOutput(true);

  JobClient jobClient = new JobClient(conf);

  FileSystem inputFS = FileSystem.get(URI.create(inputPathPrefix), conf);
  DatePathFilter datePathFilter = new DatePathFilter(startDate, endDate);
  List<Path> filePaths = getFilePaths(inputFS, new Path(inputPathPrefix), datePathFilter, jobClient.getDefaultMaps());

  // Write the file names to a temporary index file to be used
  // as input to the map tasks.
  FileSystem outputFS = FileSystem.get(URI.create(tempFile), conf);
  FSDataOutputStream outputStream = outputFS.create(new Path(tempFile), true);
  try {
    for (Path path : filePaths) {
      outputStream.writeBytes(path.toString() + "\n");
    }
  }
  finally {
    outputStream.close();
  }

  conf.setNumMapTasks(Math.min(filePaths.size(), jobClient.getDefaultMaps()));

  return conf;
}
 
Example #20
Source File: MultipleTextOutputFormat.java    From hadoop-gpu with Apache License 2.0 5 votes vote down vote up
@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job,
    String name, Progressable arg3) throws IOException {
  if (theTextOutputFormat == null) {
    theTextOutputFormat = new TextOutputFormat<K, V>();
  }
  return theTextOutputFormat.getRecordWriter(fs, job, name, arg3);
}
 
Example #21
Source File: MultipleTextOutputFormat.java    From hadoop with Apache License 2.0 5 votes vote down vote up
@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job,
    String name, Progressable arg3) throws IOException {
  if (theTextOutputFormat == null) {
    theTextOutputFormat = new TextOutputFormat<K, V>();
  }
  return theTextOutputFormat.getRecordWriter(fs, job, name, arg3);
}
 
Example #22
Source File: HadoopMapredCompatWordCount.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
public static void main(String[] args) throws Exception {
	if (args.length < 2) {
		System.err.println("Usage: WordCount <input path> <result path>");
		return;
	}

	final String inputPath = args[0];
	final String outputPath = args[1];

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// Set up the Hadoop Input Format
	HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
	TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

	// Create a Flink job with it
	DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

	DataSet<Tuple2<Text, LongWritable>> words =
			text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
				.groupBy(0).reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

	// Set up Hadoop Output Format
	HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
			new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
	hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
	TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

	// Output & Execute
	words.output(hadoopOutputFormat).setParallelism(1);
	env.execute("Hadoop Compat WordCount");
}
 
Example #23
Source File: TestUtils.java    From circus-train with Apache License 2.0 5 votes vote down vote up
public static Table createPartitionedTable(
    HiveMetaStoreClient metaStoreClient,
    String database,
    String table,
    URI location)
  throws Exception {
  return createPartitionedTable(metaStoreClient, database, table, location, DATA_COLUMNS, PARTITION_COLUMNS,
      "org.apache.hadoop.hive.serde2.OpenCSVSerde", TextInputFormat.class.getName(),
      TextOutputFormat.class.getName());
}
 
Example #24
Source File: TestUtils.java    From circus-train with Apache License 2.0 5 votes vote down vote up
public static Table createUnpartitionedTable(
    HiveMetaStoreClient metaStoreClient,
    String database,
    String table,
    URI location)
  throws TException {
  Table hiveTable = new Table();
  hiveTable.setDbName(database);
  hiveTable.setTableName(table);
  hiveTable.setTableType(TableType.EXTERNAL_TABLE.name());
  hiveTable.putToParameters("EXTERNAL", "TRUE");

  StorageDescriptor sd = new StorageDescriptor();
  sd.setCols(DATA_COLUMNS);
  sd.setLocation(location.toString());
  sd.setParameters(new HashMap<String, String>());
  sd.setInputFormat(TextInputFormat.class.getName());
  sd.setOutputFormat(TextOutputFormat.class.getName());
  sd.setSerdeInfo(new SerDeInfo());
  sd.getSerdeInfo().setSerializationLib("org.apache.hadoop.hive.serde2.OpenCSVSerde");

  hiveTable.setSd(sd);

  metaStoreClient.createTable(hiveTable);

  ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(true, database, table);
  ColumnStatisticsData statsData = new ColumnStatisticsData(_Fields.LONG_STATS, new LongColumnStatsData(1L, 2L));
  ColumnStatisticsObj cso1 = new ColumnStatisticsObj("id", "bigint", statsData);
  List<ColumnStatisticsObj> statsObj = Collections.singletonList(cso1);
  metaStoreClient.updateTableColumnStatistics(new ColumnStatistics(statsDesc, statsObj));

  return hiveTable;
}
 
Example #25
Source File: CountKmers.java    From emr-sample-apps with Apache License 2.0 4 votes vote down vote up
/**
 * @param args
 * @throws IOException 
 */
public static void main(String[] args) throws IOException 
{
	String inpath = null;
	String outpath = null;
	int kmerlen = 0;
	int numMappers = 1;
	int numReducers = 1;
	int showpos = 0;
	
	int data = 1;
	
	if (data == 0)
	{	
		if (args.length != 6)
		{
			System.err.println("Usage: CountKmers filename outpath kmerlen showpos numMappers numReducers");
			return;
		}
		
		inpath      =                  args[0];
		outpath     =                  args[1];
		kmerlen     = Integer.parseInt(args[2]);
		showpos     = Integer.parseInt(args[3]);
		numMappers  = Integer.parseInt(args[4]);
		numReducers = Integer.parseInt(args[5]);
	}
	else if (data == 1)
	{
		inpath = "/user/guest/cloudburst/s_suis.br";
		outpath = "/user/mschatz/kmers";
		kmerlen = 12;
		showpos = 0;
		numMappers = 1;
		numReducers = 1;
	}
	
	System.out.println("inpath: " + inpath);
	System.out.println("outpath: " + outpath);
	System.out.println("kmerlen: " + kmerlen);
	System.out.println("showpos: " + showpos);
	System.out.println("nummappers: " + numMappers);
	System.out.println("numreducers: " + numReducers);
	
	JobConf conf = new JobConf(MerReduce.class);
	conf.setNumMapTasks(numMappers);
	conf.setNumReduceTasks(numReducers);
		
	conf.addInputPath(new Path(inpath));;
	conf.set("KMER_LEN", Integer.toString(kmerlen));
	conf.set("SHOW_POS", Integer.toString(showpos));
	
	conf.setInputFormat(SequenceFileInputFormat.class);
		
	conf.setMapOutputKeyClass(BytesWritable.class);
	conf.setMapOutputValueClass(IntWritable.class);
	//conf.setCompressMapOutput(true);
			
	conf.setOutputKeyClass(Text.class);
	conf.setOutputValueClass(Text.class);
	conf.setOutputFormat(TextOutputFormat.class);
	
	conf.setMapperClass(MerMapClass.class);
	conf.setReducerClass(MerReduceClass.class);

	Path oPath = new Path(outpath);
	conf.setOutputPath(oPath);
	System.err.println("  Removing old results");
	FileSystem.get(conf).delete(oPath);
			
	conf.setJobName("CountMers");

	Timer t = new Timer();
	RunningJob rj = JobClient.runJob(conf);
	System.err.println("CountMers Finished");
	
	System.err.println("Total Running time was " + t.get());
	
	Counters counters = rj.getCounters( );
	Counters.Group task = counters.getGroup("org.apache.hadoop.mapred.Task$Counter");		
	long numDistinctMers = task.getCounter("REDUCE_INPUT_GROUPS");
	System.err.println("Num Distinct Mers: " + numDistinctMers);
}
 
Example #26
Source File: UnionExample.java    From tez with Apache License 2.0 4 votes vote down vote up
private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
    Map<String, LocalResource> localResources, Path stagingDir,
    String inputPath, String outputPath) throws IOException {
  DAG dag = DAG.create("UnionExample");
  
  int numMaps = -1;
  Configuration inputConf = new Configuration(tezConf);
  inputConf.setBoolean("mapred.mapper.new-api", false);
  inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
  inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
  MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
  DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();

  Vertex mapVertex1 = Vertex.create("map1", ProcessorDescriptor.create(
      TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

  Vertex mapVertex2 = Vertex.create("map2", ProcessorDescriptor.create(
      TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

  Vertex mapVertex3 = Vertex.create("map3", ProcessorDescriptor.create(
      TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

  Vertex checkerVertex = Vertex.create("checker", ProcessorDescriptor.create(
      UnionProcessor.class.getName()), 1);

  Configuration outputConf = new Configuration(tezConf);
  outputConf.setBoolean("mapred.reducer.new-api", false);
  outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
  outputConf.set(FileOutputFormat.OUTDIR, outputPath);
  DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
  checkerVertex.addDataSink("union", od);
  

  Configuration allPartsConf = new Configuration(tezConf);
  DataSinkDescriptor od2 = MROutput.createConfigBuilder(allPartsConf,
      TextOutputFormat.class, outputPath + "-all-parts").build();
  checkerVertex.addDataSink("all-parts", od2);

  Configuration partsConf = new Configuration(tezConf);    
  DataSinkDescriptor od1 = MROutput.createConfigBuilder(partsConf,
      TextOutputFormat.class, outputPath + "-parts").build();
  VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
  unionVertex.addDataSink("parts", od1);

  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName()).build();

  dag.addVertex(mapVertex1)
      .addVertex(mapVertex2)
      .addVertex(mapVertex3)
      .addVertex(checkerVertex)
      .addEdge(
          Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
      .addEdge(
          GroupInputEdge.create(unionVertex, checkerVertex, edgeConf.createDefaultEdgeProperty(),
              InputDescriptor.create(
                  ConcatenatedMergedKeyValuesInput.class.getName())));
  return dag;  
}
 
Example #27
Source File: LinkRank.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 * 
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 * 
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb)
  throws IOException {

  // configure the counter job
  Path numLinksPath = new Path(webGraphDb, NUM_NODES);
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  JobConf counter = new NutchJob(getConf());
  counter.setJobName("LinkRank Counter");
  FileInputFormat.addInputPath(counter, nodeDb);
  FileOutputFormat.setOutputPath(counter, numLinksPath);
  counter.setInputFormat(SequenceFileInputFormat.class);
  counter.setMapperClass(Counter.class);
  counter.setCombinerClass(Counter.class);
  counter.setReducerClass(Counter.class);
  counter.setMapOutputKeyClass(Text.class);
  counter.setMapOutputValueClass(LongWritable.class);
  counter.setOutputKeyClass(Text.class);
  counter.setOutputValueClass(LongWritable.class);
  counter.setNumReduceTasks(1);
  counter.setOutputFormat(TextOutputFormat.class);
  counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the counter job, outputs to a single reduce task and file
  LOG.info("Starting link counter job");
  try {
    JobClient.runJob(counter);
  }
  catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished link counter job");

  // read the first (and only) line from the file which should be the
  // number of links in the web graph
  LOG.info("Reading numlinks temp file");
  FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
  BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
  String numLinksLine = buffer.readLine();
  readLinks.close();
  
  // check if there are links to process, if none, webgraph might be empty
  if (numLinksLine == null || numLinksLine.length() == 0) {
    fs.delete(numLinksPath, true);
    throw new IOException("No links to process, is the webgraph empty?");
  }
  
  // delete temp file and convert and return the number of links as an int
  LOG.info("Deleting numlinks temp file");
  fs.delete(numLinksPath, true);
  String numLinks = numLinksLine.split("\\s+")[1];
  return Integer.parseInt(numLinks);
}
 
Example #28
Source File: NodeDumper.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 *
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile)
  throws Exception {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Configuration conf = getConf();

  JobConf dumper = new NutchJob(conf);
  dumper.setJobName("NodeDumper: " + webGraphDb);
  FileInputFormat.addInputPath(dumper, nodeDb);
  dumper.setInputFormat(SequenceFileInputFormat.class);

  if (nameType == null) {
    dumper.setMapperClass(Sorter.class);
    dumper.setReducerClass(Sorter.class);
    dumper.setMapOutputKeyClass(FloatWritable.class);
    dumper.setMapOutputValueClass(Text.class);
  } else {
    dumper.setMapperClass(Dumper.class);
    dumper.setReducerClass(Dumper.class);
    dumper.setMapOutputKeyClass(Text.class);
    dumper.setMapOutputValueClass(FloatWritable.class);
  }

  dumper.setOutputKeyClass(Text.class);
  dumper.setOutputValueClass(FloatWritable.class);
  FileOutputFormat.setOutputPath(dumper, output);

  if (asSequenceFile) {
    dumper.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    dumper.setOutputFormat(TextOutputFormat.class);
  }

  dumper.setNumReduceTasks(1);
  dumper.setBoolean("inlinks", type == DumpType.INLINKS);
  dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
  dumper.setBoolean("scores", type == DumpType.SCORES);

  dumper.setBoolean("host", nameType == NameType.HOST);
  dumper.setBoolean("domain", nameType == NameType.DOMAIN);
  dumper.setBoolean("sum", aggrType == AggrType.SUM);
  dumper.setBoolean("max", aggrType == AggrType.MAX);

  dumper.setLong("topn", topN);

  // Set equals-sign as separator for Solr's ExternalFileField
  if (asEff) {
    dumper.set("mapred.textoutputformat.separator", "=");
  }

  try {
    LOG.info("NodeDumper: running");
    JobClient.runJob(dumper);
  }
  catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  long end = System.currentTimeMillis();
  LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
 
Example #29
Source File: TestKeyFieldBasedComparator.java    From hadoop-gpu with Apache License 2.0 4 votes vote down vote up
public void configure(String keySpec, int expect) throws Exception {
  Path testdir = new Path("build/test/test.mapred.spill");
  Path inDir = new Path(testdir, "in");
  Path outDir = new Path(testdir, "out");
  FileSystem fs = getFileSystem();
  fs.delete(testdir, true);
  conf.setInputFormat(TextInputFormat.class);
  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(LongWritable.class);

  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(2);

  conf.setOutputFormat(TextOutputFormat.class);
  conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class);
  conf.setKeyFieldComparatorOptions(keySpec);
  conf.setKeyFieldPartitionerOptions("-k1.1,1.1");
  conf.set("map.output.key.field.separator", " ");
  conf.setMapperClass(InverseMapper.class);
  conf.setReducerClass(IdentityReducer.class);
  if (!fs.mkdirs(testdir)) {
    throw new IOException("Mkdirs failed to create " + testdir.toString());
  }
  if (!fs.mkdirs(inDir)) {
    throw new IOException("Mkdirs failed to create " + inDir.toString());
  }
  // set up input data in 2 files 
  Path inFile = new Path(inDir, "part0");
  FileOutputStream fos = new FileOutputStream(inFile.toString());
  fos.write((line1 + "\n").getBytes());
  fos.write((line2 + "\n").getBytes());
  fos.close();
  JobClient jc = new JobClient(conf);
  RunningJob r_job = jc.submitJob(conf);
  while (!r_job.isComplete()) {
    Thread.sleep(1000);
  }
  
  if (!r_job.isSuccessful()) {
    fail("Oops! The job broke due to an unexpected error");
  }
  Path[] outputFiles = FileUtil.stat2Paths(
      getFileSystem().listStatus(outDir,
      new OutputLogFilter()));
  if (outputFiles.length > 0) {
    InputStream is = getFileSystem().open(outputFiles[0]);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    String line = reader.readLine();
    //make sure we get what we expect as the first line, and also
    //that we have two lines (both the lines must end up in the same
    //reducer since the partitioner takes the same key spec for all
    //lines
    if (expect == 1) {
      assertTrue(line.startsWith(line1));
    } else if (expect == 2) {
      assertTrue(line.startsWith(line2));
    }
    line = reader.readLine();
    if (expect == 1) {
      assertTrue(line.startsWith(line2));
    } else if (expect == 2) {
      assertTrue(line.startsWith(line1));
    }
    reader.close();
  }
}
 
Example #30
Source File: CrawlDbReader.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config) throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
      LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);
    Path tempDir =
      new Path(config.get("mapred.temp.dir", ".") +
               "/readdb-topN-temp-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("db.reader.topn", topN);

    FileInputFormat.addInputPath(job, tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(1); // create a single file.

    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) { LOG.info("CrawlDb topN: done"); }

  }