Java Code Examples for org.apache.hadoop.mapreduce.lib.input.TextInputFormat#setInputPaths()

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.TextInputFormat#setInputPaths(). Each example is taken from an open-source project; the source file and license are noted above the code.
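Before the project examples, here is a minimal, self-contained sketch of the API in a map-only job. It is not taken from any of the projects below; the class name SetInputPathsExample and the /data/in1, /data/in2, and /data/out paths are placeholders for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetInputPathsExample {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "text-input-example");
    job.setJarByClass(SetInputPathsExample.class);

    job.setInputFormatClass(TextInputFormat.class);
    // setInputPaths accepts one or more Paths (or a comma-separated String)
    // and replaces any previously configured input paths.
    TextInputFormat.setInputPaths(job, new Path("/data/in1"), new Path("/data/in2"));

    // Identity mapper: TextInputFormat emits <LongWritable offset, Text line> pairs.
    job.setMapperClass(Mapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path("/data/out"));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}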
Example 1
Source File: WordCount.java    From accumulo-examples with Apache License 2.0
public static void main(String[] args) throws Exception {
  Opts opts = new Opts();
  opts.parseArgs(WordCount.class.getName(), args);

  // Create Accumulo table and attach Summing iterator
  try (AccumuloClient client = opts.createAccumuloClient()) {
    client.tableOperations().create(opts.tableName);
    IteratorSetting is = new IteratorSetting(10, SummingCombiner.class);
    SummingCombiner.setColumns(is,
        Collections.singletonList(new IteratorSetting.Column("count")));
    SummingCombiner.setEncodingType(is, SummingCombiner.Type.STRING);
    client.tableOperations().attachIterator(opts.tableName, is);
  } catch (TableExistsException e) {
    // ignore
  }

  // Create M/R job
  Job job = Job.getInstance(opts.getHadoopConfig());
  job.setJobName(WordCount.class.getName());
  job.setJarByClass(WordCount.class);
  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.setInputPaths(job, new Path(opts.inputDirectory));

  job.setMapperClass(MapClass.class);
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(AccumuloOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Mutation.class);

  if (opts.hdfsPath != null) {
    AccumuloOutputFormat.configure().clientPropertiesPath(opts.hdfsPath)
        .defaultTable(opts.tableName).store(job);
  } else {
    AccumuloOutputFormat.configure().clientProperties(opts.getClientProperties())
        .defaultTable(opts.tableName).store(job);
  }
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 2
Source File: Mapper2HbaseDemo.java    From bigdata-tutorial with Apache License 2.0
public int run(String[] args) throws Exception {
	String input = args[0];
	Configuration conf = getConf();
	Job job = Job.getInstance(conf, Mapper2HbaseDemo.class.getSimpleName());
	job.setJarByClass(Mapper2HbaseDemo.class);
	job.setMapperClass(Map.class);
	job.setNumReduceTasks(0);
	job.setInputFormatClass(TextInputFormat.class);
	TextInputFormat.setInputPaths(job, input);
	job.setOutputFormatClass(NullOutputFormat.class);
	return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 3
Source File: TestNewCombinerGrouping.java    From hadoop with Apache License 2.0
@Test
public void testCombiner() throws Exception {
  if (!new File(TEST_ROOT_DIR).mkdirs()) {
    throw new RuntimeException("Could not create test dir: " + TEST_ROOT_DIR);
  }
  File in = new File(TEST_ROOT_DIR, "input");
  if (!in.mkdirs()) {
    throw new RuntimeException("Could not create test dir: " + in);
  }
  File out = new File(TEST_ROOT_DIR, "output");
  PrintWriter pw = new PrintWriter(new FileWriter(new File(in, "data.txt")));
  pw.println("A|a,1");
  pw.println("A|b,2");
  pw.println("B|a,3");
  pw.println("B|b,4");
  pw.println("B|c,5");
  pw.close();
  JobConf conf = new JobConf();
  conf.set("mapreduce.framework.name", "local");
  Job job = new Job(conf);
  TextInputFormat.setInputPaths(job, new Path(in.getPath()));
  TextOutputFormat.setOutputPath(job, new Path(out.getPath()));

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setGroupingComparatorClass(GroupComparator.class);

  job.setCombinerKeyGroupingComparatorClass(GroupComparator.class);
  job.setCombinerClass(Combiner.class);
  job.getConfiguration().setInt("min.num.spills.for.combine", 0);

  job.submit();
  job.waitForCompletion(false);
  if (job.isSuccessful()) {
    Counters counters = job.getCounters();

    long combinerInputRecords = counters.findCounter(
        "org.apache.hadoop.mapreduce.TaskCounter",
        "COMBINE_INPUT_RECORDS").getValue();
    long combinerOutputRecords = counters.findCounter(
        "org.apache.hadoop.mapreduce.TaskCounter",
        "COMBINE_OUTPUT_RECORDS").getValue();
    Assert.assertTrue(combinerInputRecords > 0);
    Assert.assertTrue(combinerInputRecords > combinerOutputRecords);

    BufferedReader br = new BufferedReader(new FileReader(
        new File(out, "part-r-00000")));
    Set<String> output = new HashSet<String>();
    String line = br.readLine();
    Assert.assertNotNull(line);
    output.add(line.substring(0, 1) + line.substring(4, 5));
    line = br.readLine();
    Assert.assertNotNull(line);
    output.add(line.substring(0, 1) + line.substring(4, 5));
    line = br.readLine();
    Assert.assertNull(line);
    br.close();

    Set<String> expected = new HashSet<String>();
    expected.add("A2");
    expected.add("B5");

    Assert.assertEquals(expected, output);

  } else {
    Assert.fail("Job failed");
  }
}
 
Example 4
Source File: TestNewCombinerGrouping.java    From big-c with Apache License 2.0
@Test
public void testCombiner() throws Exception {
  if (!new File(TEST_ROOT_DIR).mkdirs()) {
    throw new RuntimeException("Could not create test dir: " + TEST_ROOT_DIR);
  }
  File in = new File(TEST_ROOT_DIR, "input");
  if (!in.mkdirs()) {
    throw new RuntimeException("Could not create test dir: " + in);
  }
  File out = new File(TEST_ROOT_DIR, "output");
  PrintWriter pw = new PrintWriter(new FileWriter(new File(in, "data.txt")));
  pw.println("A|a,1");
  pw.println("A|b,2");
  pw.println("B|a,3");
  pw.println("B|b,4");
  pw.println("B|c,5");
  pw.close();
  JobConf conf = new JobConf();
  conf.set("mapreduce.framework.name", "local");
  Job job = new Job(conf);
  TextInputFormat.setInputPaths(job, new Path(in.getPath()));
  TextOutputFormat.setOutputPath(job, new Path(out.getPath()));

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setGroupingComparatorClass(GroupComparator.class);

  job.setCombinerKeyGroupingComparatorClass(GroupComparator.class);
  job.setCombinerClass(Combiner.class);
  job.getConfiguration().setInt("min.num.spills.for.combine", 0);

  job.submit();
  job.waitForCompletion(false);
  if (job.isSuccessful()) {
    Counters counters = job.getCounters();

    long combinerInputRecords = counters.findCounter(
        "org.apache.hadoop.mapreduce.TaskCounter",
        "COMBINE_INPUT_RECORDS").getValue();
    long combinerOutputRecords = counters.findCounter(
        "org.apache.hadoop.mapreduce.TaskCounter",
        "COMBINE_OUTPUT_RECORDS").getValue();
    Assert.assertTrue(combinerInputRecords > 0);
    Assert.assertTrue(combinerInputRecords > combinerOutputRecords);

    BufferedReader br = new BufferedReader(new FileReader(
        new File(out, "part-r-00000")));
    Set<String> output = new HashSet<String>();
    String line = br.readLine();
    Assert.assertNotNull(line);
    output.add(line.substring(0, 1) + line.substring(4, 5));
    line = br.readLine();
    Assert.assertNotNull(line);
    output.add(line.substring(0, 1) + line.substring(4, 5));
    line = br.readLine();
    Assert.assertNull(line);
    br.close();

    Set<String> expected = new HashSet<String>();
    expected.add("A2");
    expected.add("B5");

    Assert.assertEquals(expected, output);

  } else {
    Assert.fail("Job failed");
  }
}
 
Example 5
Source File: BulkIngestExample.java    From accumulo-examples with Apache License 2.0
public static void main(String[] args) throws Exception {
  ClientOpts opts = new ClientOpts();
  opts.parseArgs(BulkIngestExample.class.getName(), args);

  Job job = Job.getInstance(opts.getHadoopConfig());
  job.setJobName(BulkIngestExample.class.getSimpleName());
  job.setJarByClass(BulkIngestExample.class);

  job.setInputFormatClass(TextInputFormat.class);

  job.setMapperClass(MapClass.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  job.setReducerClass(ReduceClass.class);
  job.setOutputFormatClass(AccumuloFileOutputFormat.class);

  TextInputFormat.setInputPaths(job, new Path(inputDir));
  AccumuloFileOutputFormat.configure().outputPath(new Path(workDir + "/files")).store(job);

  FileSystem fs = FileSystem.get(opts.getHadoopConfig());
  try (AccumuloClient client = opts.createAccumuloClient()) {

    try (PrintStream out = new PrintStream(
        new BufferedOutputStream(fs.create(new Path(workDir + "/splits.txt"))))) {
      Collection<Text> splits = client.tableOperations().listSplits(SetupTable.tableName, 100);
      for (Text split : splits)
        out.println(Base64.getEncoder().encodeToString(split.copyBytes()));
      job.setNumReduceTasks(splits.size() + 1);
    }

    job.setPartitionerClass(RangePartitioner.class);
    RangePartitioner.setSplitFile(job, workDir + "/splits.txt");

    job.waitForCompletion(true);
    Path failures = new Path(workDir, "failures");
    fs.delete(failures, true);
    fs.mkdirs(new Path(workDir, "failures"));
    // With HDFS permissions on, we need to make sure the Accumulo user can read/move the rfiles
    FsShell fsShell = new FsShell(opts.getHadoopConfig());
    fsShell.run(new String[] {"-chmod", "-R", "777", workDir});
    client.tableOperations().importDirectory(workDir + "/files").to(SetupTable.tableName).load();
  }
  System.exit(job.isSuccessful() ? 0 : 1);
}
 
Example 6
Source File: DataValidationJobExecutor.java    From jumbune with GNU Lesser General Public License v3.0
/**
 * The main method.
 *
 * @param args
 *            the arguments
 * @throws JumbuneException
 *             the Jumbune exception
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 * @throws InterruptedException
 *             the interrupted exception
 * @throws ClassNotFoundException
 *             the class not found exception
 */
public static void main(String[] args)
		throws JumbuneException, IOException, InterruptedException, ClassNotFoundException {
	
	Configuration conf = new Configuration();		
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();		
	
	LOGGER.debug("Data validation job received args length [ " + otherArgs.length + "]"
			+ "and values respectively [" + otherArgs[0] + "], " + otherArgs[1] + ", [" + otherArgs[2] + "]" + ", ["
			+ otherArgs[3] + "]");
	StringBuilder sb = new StringBuilder();
	
	int dynamicArgs = 0;
	
	dynamicArgs = ((otherArgs.length)-1);		
	
	for (int j = dynamicArgs; j < otherArgs.length; j++) {
		sb.append(otherArgs[j]);
	}
	String outputPath = DataValidationConstants.OUTPUT_DIR_PATH + new Date().getTime();
	String inputPath = otherArgs[0];		
	String slaveFileLoc = otherArgs[1];		
	String maxViolations = otherArgs[2];
	String noOfReducers = otherArgs[3];		
	String dvBeanString = sb.toString();
	LOGGER.debug("Received dvBean value [" + dvBeanString + "]");
	
	Gson gson = new Gson();
	Type type = new TypeToken<DataValidationBean>() {
	}.getType();
	DataValidationBean dataValidationBean = gson.fromJson(dvBeanString, type);
	String recordSeparator = dataValidationBean.getRecordSeparator();
	List<FieldValidationBean> fieldValidationList = dataValidationBean.getFieldValidationList();
	boolean[][] validateArray = new boolean[3][dataValidationBean.getNumOfFields()];
	for (FieldValidationBean fieldValidationBean : fieldValidationList) {
		validateArray[DataValidationConstants.NULL_MATRIX][(fieldValidationBean.getFieldNumber() - 1)] = validate(
				fieldValidationBean.getNullCheck());
		validateArray[DataValidationConstants.DATA_TYPE_MATRIX][(fieldValidationBean.getFieldNumber()
				- 1)] = validate(fieldValidationBean.getDataType());
		validateArray[DataValidationConstants.REGEX_MATRIX][(fieldValidationBean.getFieldNumber() - 1)] = validate(
				fieldValidationBean.getRegex());
	}
	conf.set(DataValidationConstants.VALIDATE_MATRIX, new Gson().toJson(validateArray));
	conf.set(DataValidationConstants.DATA_VALIDATION_BEAN_STRING, dvBeanString);
	conf.set(DataValidationConstants.RECORD_SEPARATOR, recordSeparator);
	conf.set(DataValidationConstants.SLAVE_FILE_LOC, slaveFileLoc);

	conf.setInt(DataValidationConstants.DV_NUM_REPORT_VIOLATION, Integer.parseInt(maxViolations));
	conf.set(TEXTINPUTFORMAT_RECORD_DELIMITER, recordSeparator);

	/* Partitioner specific configuration */	
			
	conf.setBoolean(DataValidationConstants.CONFIGURE_RR_FOR_PARTITION_NO, true);
	conf.setBoolean(DataValidationConstants.CONFIGURE_RR_FOR_REDUCERS, true);

	Job job = new Job(conf, DataValidationConstants.JOB_NAME);
	job.setJarByClass(DataValidationJobExecutor.class);
	job.setMapperClass(DataValidationMapper.class);
	job.setReducerClass(DataValidationReducer.class);
	job.setPartitionerClass(DataValidationPartitioner.class);
	job.setNumReduceTasks(Integer.parseInt(noOfReducers));
	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);

	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(DataDiscrepanciesArrayWritable.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(DataViolationWritable.class);

	Path[] inputPaths = FileUtil.getAllNestedFilePath(job, inputPath);
	
	TextInputFormat.setInputPaths(job, inputPaths);
	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	LOGGER.debug("Job execution Started");
	job.waitForCompletion(true);

	LOGGER.debug("Job completion over,going to read the result from hdfs");
	Map<String, DataValidationReport> jsonMap = readDataFromHdfs(conf, outputPath);
	final Gson dvReportGson = new Gson();

	final String jsonString = dvReportGson.toJson(jsonMap);
	LOGGER.info("Completed DataValidation");
	LOGGER.info(DataValidationConstants.DV_REPORT + jsonString);
}
 
Example 7
Source File: DataProfNoCriteriaJobExecutor.java    From jumbune with GNU Lesser General Public License v3.0
/***
 * main method for job execution
 * @param args
 */
public static void main(String[] args) throws IOException, InterruptedException , ClassNotFoundException{

	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	LOGGER.debug("Data Profiling job values respectively ["+otherArgs[0]+"], "+
			 otherArgs[1]);
	StringBuilder sb = new StringBuilder();
	
	int dynamicArgs = 0;		
	dynamicArgs = ((otherArgs.length)-1);
	
	for (int i = dynamicArgs; i < otherArgs.length; i++) {
		LOGGER.debug("other arguments" + otherArgs[i]);
		sb.append(otherArgs[i]);
	}
	
	String outputPath = DataProfilingConstants.OUTPUT_DIR_PATH + new Date().getTime();
	String inputPath = otherArgs[0].replace(" ", "");
	String dpBeanString = sb.toString();
	LOGGER.debug("Received dpBean value [" + dpBeanString+"]");
	Gson gson = new Gson();
	Type type = new TypeToken<DataProfilingBean>() {
	}.getType();
	
	DataProfilingBean dataProfilingBean = gson.fromJson(dpBeanString, type);
	String recordSeparator = dataProfilingBean.getRecordSeparator();
	conf.set(DataProfilingConstants.DATA_PROFILING_BEAN, dpBeanString);
	conf.set(DataProfilingConstants.RECORD_SEPARATOR, recordSeparator);
	
	conf.set(DataProfilingConstants.TEXTINPUTFORMAT_RECORD_DELIMITER, recordSeparator);

	Job job = new Job(conf, DataProfilingConstants.JOB_NAME);

	job.setJarByClass(DataProfNoCriteriaJobExecutor.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	
	job.setMapperClass(DataProfNoCriteriaMapper.class);
	job.setCombinerClass(DataProfNoCriteriaReducer.class);
	job.setReducerClass(DataProfNoCriteriaReducer.class);
	
	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);
	
	Path[] inputPaths = FileUtil.getAllNestedFilePath(job, inputPath);
	
	TextInputFormat.setInputPaths(job, inputPaths);
	SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
	
	job.waitForCompletion(true);
	
	Map<String, Integer> sortedMap = readJobOutputFromHdfs(conf,outputPath);
	final Gson dpReportGson = new GsonBuilder().disableHtmlEscaping().create();

	final String jsonString = dpReportGson.toJson(sortedMap);
	LOGGER.info(DataProfilingConstants.DATA_PROFILING_REPORT + jsonString);
}
 
Example 8
Source File: DataProfilingJobExecutor.java    From jumbune with GNU Lesser General Public License v3.0
/**
 * @param args
 */
public static void main(String[] args) throws IOException, InterruptedException , ClassNotFoundException{
	
	Configuration conf = new Configuration();
	String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
	LOGGER.debug("Data Profiling job values respectively ["+otherArgs[0]+"], "+
			 otherArgs[1]);
	StringBuilder sb = new StringBuilder();
	
	int dynamicArgs = 0;		
	dynamicArgs = ((otherArgs.length)-1);
	
	for (int i = dynamicArgs; i < otherArgs.length; i++) {
		LOGGER.debug("other arguments" + otherArgs[i]);
		sb.append(otherArgs[i]);
	}
	
	String outputPath = DataProfilingConstants.OUTPUT_DIR_PATH + new Date().getTime();
	String inputPath = otherArgs[0];
	String dpBeanString = sb.toString();
	LOGGER.debug("Received dpBean value [" + dpBeanString+"]");
	Gson gson = new Gson();
	Type type = new TypeToken<DataProfilingBean>() {
	}.getType();
	
	DataProfilingBean dataProfilingBean = gson.fromJson(dpBeanString, type);
	String recordSeparator = dataProfilingBean.getRecordSeparator();
	conf.set(DataProfilingConstants.DATA_PROFILING_BEAN, dpBeanString);
	conf.set(DataProfilingConstants.RECORD_SEPARATOR, recordSeparator);
	
	conf.set(DataProfilingConstants.TEXTINPUTFORMAT_RECORD_DELIMITER, recordSeparator);
	
	Job job = new Job(conf,DataProfilingConstants.JOB_NAME);
	
	job.setJarByClass(DataProfilingJobExecutor.class);
	job.setMapperClass(DataProfilingMapper.class);
	
	job.setCombinerClass(DataProfilingReducer.class);
	job.setReducerClass(DataProfilingReducer.class);
	
	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(SequenceFileOutputFormat.class);
	
	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(IntWritable.class);
	
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(IntWritable.class);
	
	Path[] inputPaths = FileUtil.getAllNestedFilePath(job, inputPath);
	
	TextInputFormat.setInputPaths(job, inputPaths);
	SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
	
	job.waitForCompletion(true);
	LOGGER.debug("Job completed , now going to read the result from hdfs");
	Set<CriteriaBasedDataProfiling> criteriaBasedDataProfilings = readJobOutputFromHdfs(conf,outputPath,dataProfilingBean);
	final Gson dpReportGson = new GsonBuilder().disableHtmlEscaping().create();

	final String jsonString = dpReportGson.toJson(criteriaBasedDataProfilings);
	LOGGER.info(DataProfilingConstants.DATA_PROFILING_REPORT + jsonString);
}
 
Example 9
Source File: SSTableExport.java    From aegisthus with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(SSTableExport.class);
    CommandLine cl = getOptions(args);
    if (cl == null) {
        return 1;
    }

    // Check all of the paths and load the sstable version from the input filenames
    List<Path> paths = Lists.newArrayList();
    if (cl.hasOption(Feature.CMD_ARG_INPUT_FILE)) {
        for (String input : cl.getOptionValues(Feature.CMD_ARG_INPUT_FILE)) {
            checkVersionFromFilename(input);
            paths.add(new Path(input));
        }
    }
    if (cl.hasOption(Feature.CMD_ARG_INPUT_DIR)) {
        paths.addAll(getDataFiles(job.getConfiguration(), cl.getOptionValue(Feature.CMD_ARG_INPUT_DIR)));
    }

    String avroSchemaString = getAvroSchema(cl.getOptionValue(Feature.CMD_ARG_AVRO_SCHEMA_FILE), job.getConfiguration());
    Schema avroSchema = new Schema.Parser().parse(avroSchemaString);

    // At this point we have the version of sstable that we can use for this run
    job.getConfiguration().set(Aegisthus.Feature.CONF_SSTABLE_VERSION, version.toString());

    if (job.getConfiguration().get(Aegisthus.Feature.CONF_CQL_SCHEMA) != null) {
        setConfigurationFromCql(job.getConfiguration());
    }

    job.setInputFormatClass(AegisthusInputFormat.class);
    job.setMapperClass(CQLMapper.class);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    AvroJob.setOutputKeySchema(job, avroSchema);

    // Map-only job
    job.setNumReduceTasks(0);

    TextInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
    FileOutputFormat.setOutputPath(job, new Path(cl.getOptionValue(Feature.CMD_ARG_OUTPUT_DIR)));

    job.submit();
    System.out.println(job.getJobID());
    System.out.println(job.getTrackingURL());
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
 
Example 10
Source File: Aegisthus.java    From aegisthus with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    Configuration configuration = job.getConfiguration();

    job.setJarByClass(Aegisthus.class);
    CommandLine cl = getOptions(args);
    if (cl == null) {
        return 1;
    }

    // Check all of the paths and load the sstable version from the input filenames
    List<Path> paths = Lists.newArrayList();
    if (cl.hasOption(Feature.CMD_ARG_INPUT_FILE)) {
        for (String input : cl.getOptionValues(Feature.CMD_ARG_INPUT_FILE)) {
            paths.add(new Path(input));
        }
    }
    if (cl.hasOption(Feature.CMD_ARG_INPUT_DIR)) {
        paths.addAll(getDataFiles(configuration, cl.getOptionValue(Feature.CMD_ARG_INPUT_DIR)));
    }
    LOG.info("Processing paths: {}", paths);

    // At this point we have the version of sstable that we can use for this run
    Descriptor.Version version = Descriptor.Version.CURRENT;
    if (cl.hasOption(Feature.CMD_ARG_SSTABLE_OUTPUT_VERSION)) {
        version = new Descriptor.Version(cl.getOptionValue(Feature.CMD_ARG_SSTABLE_OUTPUT_VERSION));
    }
    configuration.set(Feature.CONF_SSTABLE_VERSION, version.toString());

    if (configuration.get(Feature.CONF_CQL_SCHEMA) != null) {
        setConfigurationFromCql(configuration);
    }

    if(cl.hasOption(Feature.CMD_ARG_COMBINE_SPLITS)) {
        job.setInputFormatClass(AegisthusCombinedInputFormat.class);
    } else {
        job.setInputFormatClass(AegisthusInputFormat.class);
    }
    job.setMapOutputKeyClass(AegisthusKey.class);
    job.setMapOutputValueClass(AtomWritable.class);
    job.setOutputKeyClass(AegisthusKey.class);
    job.setOutputValueClass(RowWritable.class);
    job.setMapperClass(AegisthusKeyMapper.class);
    job.setReducerClass(CassSSTableReducer.class);
    job.setGroupingComparatorClass(AegisthusKeyGroupingComparator.class);
    job.setPartitionerClass(AegisthusKeyPartitioner.class);
    job.setSortComparatorClass(AegisthusKeySortingComparator.class);

    TextInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));

    if (cl.hasOption(Feature.CMD_ARG_PRODUCE_SSTABLE)) {
        job.setOutputFormatClass(SSTableOutputFormat.class);
    } else {
        job.setOutputFormatClass(JsonOutputFormat.class);
    }
    CustomFileNameFileOutputFormat.setOutputPath(job, new Path(cl.getOptionValue(Feature.CMD_ARG_OUTPUT_DIR)));

    job.submit();
    if (configuration.getBoolean(Feature.CONF_SHUTDOWN_HOOK, true)) {
        Runtime.getRuntime().addShutdownHook(new JobKiller(job));
    }

    System.out.println(job.getJobID());
    System.out.println(job.getTrackingURL());
    boolean success = job.waitForCompletion(true);

    if (success) {
        Counter errorCounter = job.getCounters().findCounter("aegisthus", "error_skipped_input");
        long errorCount = errorCounter != null ? errorCounter.getValue() : 0L;
        int maxAllowed = configuration.getInt(Feature.CONF_MAX_CORRUPT_FILES_TO_SKIP, 0);
        if (errorCounter != null && errorCounter.getValue() > maxAllowed) {
            LOG.error("Found {} corrupt files which is greater than the max allowed {}", errorCount, maxAllowed);
            success = false;
        } else if (errorCount > 0) {
            LOG.warn("Found {} corrupt files but not failing the job because the max allowed is {}",
                    errorCount, maxAllowed);
        }
    }

    return success ? 0 : 1;
}
 
Example 11
Source File: Distcp.java    From aegisthus with Apache License 2.0
@Override
public int run(String[] args) throws Exception {

	CommandLine cl = getOptions(args);
	if (cl == null) {
		return 1;
	}

	Job job = initializeJob();
	String outputDir = cl.getOptionValue(OUTPUT);
	StorageHelper helper = new StorageHelper(job.getConfiguration());
	helper.setFinalPath(outputDir);

	checkOutputDirectory(job, outputDir, cl.hasOption(OPT_OVERWRITE));

	job.getConfiguration().setBoolean(CFG_PREFIX + OPT_PRIAM, cl.hasOption(OPT_PRIAM));
	if (cl.hasOption(OPT_RECURSIVE)) {
		job.getConfiguration().set(CFG_PREFIX + OPT_RECURSIVE, cleanS3(cl.getOptionValue(OPT_RECURSIVE)));
	}

	String pathTemp = String.format("/tmp/%s", UUID.randomUUID().toString());
	LOG.info(String.format("writing to %s", pathTemp));

	Path tmp = new Path("/tmp");
	FileSystem fs = tmp.getFileSystem(job.getConfiguration());
	fs.mkdirs(new Path(pathTemp));
	Path inputPath = new Path(new Path(pathTemp), "input.txt");
	Path tmpPath = new Path(new Path(pathTemp), "out");

	int fileCount = setupInput(	job,
								inputPath,
								cl.getOptionValues(OPT_INPUT_FILE),
								cl.getOptionValue(OPT_MANIFEST_IN));
	setReducers(job, fileCount);

	TextInputFormat.setInputPaths(job, inputPath.toUri().toString());
	FileOutputFormat.setOutputPath(job, tmpPath);

	boolean success = runJob(job, cl);
	// TODO: output manifest
	/*
	 * if (success && cl.hasOption(OPT_MANIFEST_OUT)) { writeManifest(job,
	 * files); }
	 */
	fs.delete(new Path(pathTemp), true);
	return success ? 0 : 1;
}
 
Example 12
Source File: TotalOrderSortingStage.java    From hadoop-map-reduce-patterns with Apache License 2.0
@SuppressWarnings("unchecked")
@Override
public int run(String[] args) throws Exception {
	Configuration conf = new Configuration();
	Path inputPath = new Path(args[0]);
	Path partitionFile = new Path(args[1] + "_partitions.lst");
	Path outputStage = new Path(args[1] + "_staging");
	Path outputOrder = new Path(args[1]);
	// Configure job to prepare for sampling
	Job sampleJob = new Job(conf, "TotalOrderSortingStage");
	sampleJob.setJarByClass(TotalOrderSortingStage.class);
	// Use the mapper implementation with zero reduce tasks
	sampleJob.setMapperClass(LastAccessMapper.class);
	sampleJob.setNumReduceTasks(0);
	sampleJob.setOutputKeyClass(Text.class);
	sampleJob.setOutputValueClass(Text.class);
	TextInputFormat.setInputPaths(sampleJob, inputPath);
	// Set the output format to a sequence file
	sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class);
	SequenceFileOutputFormat.setOutputPath(sampleJob, outputStage);
	// Submit the job and get completion code.
	int code = sampleJob.waitForCompletion(true) ? 0 : 1;
	if (code == 0) {
		Job orderJob = new Job(conf, "TotalOrderSortingStage");
		orderJob.setJarByClass(TotalOrderSortingStage.class);
		// Here, use the identity mapper to output the key/value pairs in
		// the SequenceFile
		orderJob.setMapperClass(Mapper.class);
		orderJob.setReducerClass(ValuesReducer.class);
		// Set the number of reduce tasks to an appropriate number for the
		// amount of data being sorted
		orderJob.setNumReduceTasks(10);
		// Use Hadoop's TotalOrderPartitioner class
		orderJob.setPartitionerClass(TotalOrderPartitioner.class);
		// Set the partition file
		TotalOrderPartitioner.setPartitionFile(orderJob.getConfiguration(),
				partitionFile);
		orderJob.setOutputKeyClass(Text.class);
		orderJob.setOutputValueClass(Text.class);
		// Set the input to the previous job's output
		orderJob.setInputFormatClass(SequenceFileInputFormat.class);
		SequenceFileInputFormat.setInputPaths(orderJob, outputStage);
		// Set the output path to the command line parameter
		TextOutputFormat.setOutputPath(orderJob, outputOrder);
		// Set the separator to an empty string
		orderJob.getConfiguration().set(
				"mapred.textoutputformat.separator", "");
		// Use the InputSampler to go through the output of the previous
		// job, sample it, and create the partition file
		InputSampler.writePartitionFile(orderJob,
				new InputSampler.RandomSampler(.001, 10000));
		// Submit the job
		code = orderJob.waitForCompletion(true) ? 0 : 2;
	}
	// Clean up the partition file and the staging directory
	FileSystem.get(new Configuration()).delete(partitionFile, false);
	FileSystem.get(new Configuration()).delete(outputStage, true);
	return code;
}