Java Code Examples for org.apache.hadoop.mapreduce.Job#setInputFormatClass()

The following examples show how to use org.apache.hadoop.mapreduce.Job#setInputFormatClass(). Each example notes the project and source file it was taken from.
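Before the project examples, here is a minimal, self-contained sketch of the usual pattern (the class name, job name, and paths are illustrative, not taken from any project below). setInputFormatClass() tells the job which InputFormat to use, i.e. how its input files are split and deserialized into key/value pairs for the mapper; it is typically set alongside the mapper, output format, and input/output paths.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class InputFormatDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "input format demo");
        job.setJarByClass(InputFormatDemo.class);

        // TextInputFormat splits the input files and delivers each line to the mapper
        // as a (LongWritable byte offset, Text line) pair.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Map-only identity job: the base Mapper passes its input pairs straight through.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}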
Example 1
Source File: Step5.java    From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  
    // TODO Auto-generated method stub  
    Configuration conf1 = new Configuration();  

    Job job1 = new Job(conf1, "wiki  job five");  
    job1.setNumReduceTasks(1);  
    job1.setJarByClass(Step5.class);  
    job1.setInputFormatClass(SequenceFileInputFormat.class);  
    job1.setMapperClass(WikiMapper5.class);  
    job1.setMapOutputKeyClass(VarLongWritable.class);  
    job1.setMapOutputValueClass(VectorWritable.class);  
      
    job1.setCombinerClass(WiKiCombiner5.class);  
    job1.setReducerClass(WiKiReducer5.class);  
    job1.setOutputKeyClass(VarLongWritable.class);  
    job1.setOutputValueClass(RecommendedItemsWritable.class);  
//   job1.setOutputFormatClass(SequenceFileOutputFormat.class);  
    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));  
  
    FileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));     
    if(!job1.waitForCompletion(true)){  
        System.exit(1); // exit if the job failed
    }  
}
 
Example 2
Source File: BinaryLoader.java    From marklogic-contentpump with Apache License 2.0
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: ContentLoader configFile inputDir");
        System.exit(2);
    }
    
    Job job = Job.getInstance(conf);
    job.setJarByClass(BinaryLoader.class);
    job.setInputFormatClass(BinaryInputFormat.class);
    job.setMapperClass(ContentMapper.class);
    job.setMapOutputKeyClass(DocumentURI.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(ContentOutputFormat.class);
    
    BinaryInputFormat.setInputPaths(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
     
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
 
Example 3
Source File: WordDistributionStatisticsCollector.java    From dkpro-c4corpus with Apache License 2.0
@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    job.setJarByClass(WordDistributionStatisticsCollector.class);
    job.setJobName(WordDistributionStatisticsCollector.class.getName());

    // mapper
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // reducer
    job.setReducerClass(SumReducer.class);
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 4
Source File: TestReflectInputOutputFormat.java    From parquet-mr with Apache License 2.0
@Before
public void createParquetFile() throws Exception {
  // set up readers and writers not in MR
  conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
  AvroReadSupport.setAvroDataSupplier(conf, ReflectDataSupplier.class);
  AvroWriteSupport.setAvroDataSupplier(conf, ReflectDataSupplier.class);

  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    final Job job = new Job(conf, "write");

    // input not really used
    TextInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TestReflectInputOutputFormat.MyMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(AvroParquetOutputFormat.class);
    AvroParquetOutputFormat.setOutputPath(job, parquetPath);
    AvroParquetOutputFormat.setSchema(job, CAR_SCHEMA);
    AvroParquetOutputFormat.setAvroDataSupplier(job, ReflectDataSupplier.class);

    waitForJob(job);
  }
}
 
Example 5
Source File: WARCRecordCounter.java    From dkpro-c4corpus with Apache License 2.0
@Override public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    System.out.println("Other args: " + Arrays.toString(otherArgs));

    Job job = Job.getInstance(conf);
    job.setJarByClass(WARCRecordCounter.class);

    job.setJobName(WARCRecordCounter.class.getName());

    // mapper
    job.setMapperClass(ResponseMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // combiner + reducer
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 6
Source File: CompactionOrcJobConfigurator.java    From incubator-gobblin with Apache License 2.0
protected void configureMapper(Job job) {
  job.setInputFormatClass(OrcValueCombineFileInputFormat.class);
  job.setMapperClass(OrcValueMapper.class);
  job.setMapOutputKeyClass(OrcKey.class);
  job.setMapOutputValueClass(OrcValue.class);
  job.setGroupingComparatorClass(OrcKeyComparator.class);
  job.setSortComparatorClass(OrcKeyComparator.class);
}
 
Example 7
Source File: Main.java    From hiped2 with Apache License 2.0
public static double calcPageRank(Path inputPath, Path outputPath, int numNodes)
    throws Exception {
  Configuration conf = new Configuration();
  conf.setInt(Reduce.CONF_NUM_NODES_GRAPH, numNodes);

  Job job = new Job(conf);
  job.setJarByClass(Main.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setInputFormatClass(KeyValueTextInputFormat.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  if (!job.waitForCompletion(true)) {
    throw new Exception("Job failed");
  }

  long summedConvergence = job.getCounters().findCounter(
      Reduce.Counter.CONV_DELTAS).getValue();
  double convergence =
      ((double) summedConvergence /
          Reduce.CONVERGENCE_SCALING_FACTOR) /
          (double) numNodes;

  System.out.println("======================================");
  System.out.println("=  Num nodes:           " + numNodes);
  System.out.println("=  Summed convergence:  " + summedConvergence);
  System.out.println("=  Convergence:         " + convergence);
  System.out.println("======================================");

  return convergence;
}
 
Example 8
Source File: JMatrixMultiplicationStep2.java    From RecommendationEngine with MIT License
public static void run() throws IOException, ClassNotFoundException,
		InterruptedException {
	String inputPath = ItemBasedCFDriver.path.get("step8InputPath");
	String outputPath = ItemBasedCFDriver.path.get("step8OutputPath");

	Configuration conf = new Configuration();

	Job job = Job.getInstance(conf);

	HDFS hdfs = new HDFS(conf);
	hdfs.rmr(outputPath);

	job.setMapperClass(Step2_Mapper.class);
	job.setReducerClass(Step2_Reducer.class);
	job.setJarByClass(JMatrixMultiplicationStep2.class);
	job.setNumReduceTasks(ItemBasedCFDriver.ReducerNumber);

	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(Text.class);
	job.setOutputKeyClass(NullWritable.class);
	job.setOutputValueClass(Text.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.setInputPaths(job, new Path(inputPath));
	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	job.waitForCompletion(true);
}
 
Example 9
Source File: DataDrivenDBInputFormat.java    From hadoop with Apache License 2.0
/**
 * setInput() takes a custom query and a separate "bounding query" to use
 * instead of the custom "count query" used by DBInputFormat.
 */
public static void setInput(Job job,
    Class<? extends DBWritable> inputClass,
    String inputQuery, String inputBoundingQuery) {
  DBInputFormat.setInput(job, inputClass, inputQuery, "");
  job.getConfiguration().set(DBConfiguration.INPUT_BOUNDING_QUERY, inputBoundingQuery);
  job.setInputFormatClass(DataDrivenDBInputFormat.class);
}
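A hypothetical call to this helper, sketched against the same signature, might look like the following. The Employees table, its columns, and the MyRecord DBWritable class are invented for illustration; the input query is expected to contain the $CONDITIONS placeholder that the format replaces with per-split range conditions derived from the bounding query.

// Hypothetical usage; MyRecord is an application-defined DBWritable and the queries are illustrative.
DataDrivenDBInputFormat.setInput(job, MyRecord.class,
    "SELECT id, name FROM Employees WHERE $CONDITIONS",  // custom input query
    "SELECT MIN(id), MAX(id) FROM Employees");           // bounding query used to compute split ranges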
 
Example 10
Source File: AvroConversionBaseCreator.java    From datacollector with Apache License 2.0
@Override
public Job call() throws Exception {
  // We're explicitly disabling speculative execution
  conf.set("mapreduce.map.speculative", "false");
  conf.set("mapreduce.map.maxattempts", "1");

  conf.set("mapreduce.job.user.classpath.first", "true");
  conf.set("mapreduce.task.classpath.user.precedence", "true");
  conf.set("mapreduce.task.classpath.first", "true");

  addNecessaryJarsToJob(conf);

  Job job = Job.getInstance(conf);

  // IO formats
  job.setInputFormatClass(getInputFormatClass());
  job.setOutputFormatClass(NullOutputFormat.class);

  // Mapper & job output
  job.setMapperClass(getMapperClass());
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(NullWritable.class);

  // It's a map-only job
  job.setNumReduceTasks(0);

  // General configuration
  job.setJarByClass(getClass());


  return job;
}
 
Example 11
Source File: PhoenixMapReduceUtil.java    From phoenix with Apache License 2.0
/**
 * Sets the Phoenix input format on the job and stores the table, projected
 * columns, and input class in its configuration.
 *
 * @param job        the MapReduce job
 * @param inputClass DBWritable class
 * @param tableName  input table name
 * @param conditions condition clause to be added to the WHERE clause
 * @param fieldNames fields being projected for the SELECT query
 */
public static void setInput(final Job job, final Class<? extends DBWritable> inputClass, final String tableName , final String conditions, final String... fieldNames) {
      job.setInputFormatClass(PhoenixInputFormat.class);
      final Configuration configuration = job.getConfiguration();
      PhoenixConfigurationUtil.setInputTableName(configuration, tableName);
      PhoenixConfigurationUtil.setSelectColumnNames(configuration,fieldNames);
      PhoenixConfigurationUtil.setInputClass(configuration,inputClass);
      PhoenixConfigurationUtil.setSchemaType(configuration, SchemaType.TABLE);
}
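For context, a hypothetical invocation of this overload might look like the following; the STOCKS table, its columns, the condition, and the StockWritable DBWritable class are illustrative, not taken from the Phoenix source.

// Hypothetical usage; table, columns, and StockWritable are illustrative only.
PhoenixMapReduceUtil.setInput(job, StockWritable.class, "STOCKS",
    "RECORDING_YEAR = 2009",            // conditions appended to the WHERE clause
    "STOCK_NAME", "RECORDING_YEAR");    // fields projected in the SELECT query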
 
Example 12
Source File: KafkaMRInput.java    From kylin with Apache License 2.0
@Override
public void configureJob(Job job) {
    job.setInputFormatClass(SequenceFileInputFormat.class);
    String jobId = job.getConfiguration().get(BatchConstants.ARG_CUBING_JOB_ID);
    IJoinedFlatTableDesc flatHiveTableDesc = new CubeJoinedFlatTableDesc(cubeSegment);
    String inputPath = JoinedFlatTable.getTableDir(flatHiveTableDesc,
            JobBuilderSupport.getJobWorkingDir(conf, jobId));
    try {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
 
Example 13
Source File: JsonDataValidationExecutor.java    From jumbune with GNU Lesser General Public License v3.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    StringBuilder sb = new StringBuilder();
    for (int j = 2; j < otherArgs.length; j++) {
        sb.append(otherArgs[j]);
    }

    LOGGER.debug("Arguments[ " + otherArgs.length + "]" + "and values respectively [" + otherArgs[0] + "], "
            + otherArgs[1] + ", [" + otherArgs[2] + "]" + ", [" + otherArgs[3] + "]," + otherArgs[4]);

    String inputpath = otherArgs[0];
    String outputpath = "/tmp/jumbune/dvjsonreport" + new Date().getTime();

    String json = otherArgs[1];
    String nullCondition = otherArgs[2];
    String regex = otherArgs[3];
    String dvDir = otherArgs[4];

    if (regex.isEmpty()) {
        conf.set(JsonDataVaildationConstants.REGEX_ARGUMENT, "");
    } else {
        conf.set(JsonDataVaildationConstants.REGEX_ARGUMENT, regex);
    }

    if (nullCondition.isEmpty()) {
        conf.set(JsonDataVaildationConstants.NULL_ARGUMENT, "");
    } else {
        conf.set(JsonDataVaildationConstants.NULL_ARGUMENT, nullCondition);
    }

    conf.set(JsonDataVaildationConstants.SLAVE_DIR, dvDir);
    conf.set(JsonDataVaildationConstants.JSON_ARGUMENT, json);
    FileSystem fs = FileSystem.get(conf);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "JSONDataValidation");
    job.setJarByClass(JsonDataValidationExecutor.class);

    job.setInputFormatClass(JsonFileInputFormat.class);

    job.setMapperClass(JsonDataValidationMapper.class);
    job.setPartitionerClass(JsonDataValidationPartitioner.class);
    job.setReducerClass(JsonDataValidationReducer.class);
    job.setNumReduceTasks(5);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(FileKeyViolationBean.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TotalReducerViolationBean.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    Path[] inputPaths = FileUtil.getAllJsonNestedFilePath(job, inputpath);

    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, new Path(outputpath));

    if (fs.exists(new Path(outputpath))) {
        fs.delete(new Path(outputpath), true);
    }

    job.waitForCompletion(true);

    Map<String, JsonViolationReport> jsonMap = readDataFromHdfs(conf, outputpath);
    final Gson gson = new Gson();
    final String jsonReport = gson.toJson(jsonMap);

    LOGGER.info("Completed DataValidation");
    LOGGER.info(JsonDataVaildationConstants.JSON_DV_REPORT + jsonReport);
}
 
Example 14
Source File: TopKPhaseJob.java    From incubator-pinot with Apache License 2.0
public Job run() throws Exception {
  Job job = Job.getInstance(getConf());
  job.setJobName(name);
  job.setJarByClass(TopKPhaseJob.class);

  Configuration configuration = job.getConfiguration();
  FileSystem fs = FileSystem.get(configuration);

  // Properties
  LOGGER.info("Properties {}", props);

   // Input Path
  String inputPathDir = getAndSetConfiguration(configuration, TOPK_PHASE_INPUT_PATH);
  LOGGER.info("Input path dir: " + inputPathDir);
  for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) {
    LOGGER.info("Adding input:" + inputPath);
    Path input = new Path(inputPath);
    FileInputFormat.addInputPath(job, input);
  }

  // Output path
  Path outputPath = new Path(getAndSetConfiguration(configuration, TOPK_PHASE_OUTPUT_PATH));
  LOGGER.info("Output path dir: " + outputPath.toString());
  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(job, outputPath);

  // Schema
  Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
  LOGGER.info("Schema : {}", avroSchema.toString(true));

  // ThirdEyeConfig
  String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty(
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty);
  String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
      props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
  ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
  job.getConfiguration().set(TOPK_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

  // Map config
  job.setMapperClass(TopKPhaseMapper.class);
  job.setInputFormatClass(AvroKeyInputFormat.class);
  job.setMapOutputKeyClass(BytesWritable.class);
  job.setMapOutputValueClass(BytesWritable.class);

  // Combiner
  job.setCombinerClass(TopKPhaseCombiner.class);

   // Reduce config
  job.setReducerClass(TopKPhaseReducer.class);
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(NullWritable.class);
  job.setNumReduceTasks(1);

  job.waitForCompletion(true);

  return job;
}
 
Example 15
Source File: DomainStatistics.java    From anthelion with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 3) {
    System.out.println("usage: DomainStatistics inputDirs outDir host|domain|suffix|tld [numOfReducer]");
    return 1;
  }
  String inputDir = args[0];
  String outputDir = args[1];
  int numOfReducers = 1;

  if (args.length > 3) {
    numOfReducers = Integer.parseInt(args[3]);
  }

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("DomainStatistics: starting at " + sdf.format(start));

  int mode = 0;
  String jobName = "DomainStatistics";
  if(args[2].equals("host")) {
    jobName = "Host statistics";
    mode = MODE_HOST;
  } else if(args[2].equals("domain")) {
    jobName  = "Domain statistics";
    mode = MODE_DOMAIN;
  } else if(args[2].equals("suffix")) {
    jobName = "Suffix statistics";
    mode = MODE_SUFFIX;
  } else if(args[2].equals("tld")) {
    jobName = "TLD statistics";
    mode = MODE_TLD;
  }

  Configuration conf = getConf();
  conf.setInt("domain.statistics.mode", mode);
  conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  Job job = new Job(conf, jobName);
  job.setJarByClass(DomainStatistics.class);

  String[] inputDirsSpecs = inputDir.split(",");
  for (int i = 0; i < inputDirsSpecs.length; i++) {
    FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i]));
  }

  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(outputDir));
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);

  job.setMapperClass(DomainStatisticsMapper.class);
  job.setReducerClass(DomainStatisticsReducer.class);
  job.setCombinerClass(DomainStatisticsCombiner.class);
  job.setNumReduceTasks(numOfReducers);

  try {
    job.waitForCompletion(true);
  } catch (Exception e) {
    throw e;
  }

  long end = System.currentTimeMillis();
  LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  return 0;
}
 
Example 16
Source File: MapReduceExercise.java    From mongodb-hadoop-workshop with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    if(args.length < 3) {
        System.err.println("Usage: MapReduceExercise " +
            "[mongodb input uri] " +
            "[mongodb output uri] " +
            "update=[true or false]");

        System.err.println("Example: MapReduceExercise " +
            "mongodb://127.0.0.1:27017/movielens.ratings " +
            "mongodb://127.0.0.1:27017/movielens.ratings.stats update=false");

        System.err.println("Example: MapReduceExercise " +
            "mongodb://127.0.0.1:27017/movielens.ratings " +
            "mongodb://127.0.0.1:27017/movielens.movies update=true");

        System.exit(-1);
    }

    Class outputValueClass = BSONWritable.class;
    Class reducerClass = Reduce.class;

    if(args[2].equals("update=true")) {
        outputValueClass = MongoUpdateWritable.class;
        reducerClass = ReduceUpdater.class;
    }

    Configuration conf = new Configuration();

    // Set MongoDB-specific configuration items
    conf.setClass("mongo.job.mapper", Map.class, Mapper.class);
    conf.setClass("mongo.job.reducer", reducerClass, Reducer.class);

    conf.setClass("mongo.job.mapper.output.key", IntWritable.class, Object.class);
    conf.setClass("mongo.job.mapper.output.value", DoubleWritable.class, Object.class);

    conf.setClass("mongo.job.output.key", NullWritable.class, Object.class);
    conf.setClass("mongo.job.output.value", outputValueClass, Object.class);

    conf.set("mongo.input.uri",  args[0]);
    conf.set("mongo.output.uri", args[1]);

    Job job = Job.getInstance(conf);

    // Set Hadoop-specific job parameters
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(outputValueClass);

    job.setMapperClass(Map.class);
    job.setReducerClass(reducerClass);

    job.setJarByClass(MapReduceExercise.class);

    job.submit();
}
 
Example 17
Source File: HalyardBulkExport.java    From Halyard with Apache License 2.0
@Override
protected int run(CommandLine cmd) throws Exception {
    if (!cmd.getArgList().isEmpty()) throw new HalyardExport.ExportException("Unknown arguments: " + cmd.getArgList().toString());
    String source = cmd.getOptionValue('s');
    String queryFiles = cmd.getOptionValue('q');
    String target = cmd.getOptionValue('t');
    if (!target.contains("{0}")) {
        throw new HalyardExport.ExportException("Bulk export target must contain '{0}' to be replaced by stripped filename of the actual SPARQL query.");
    }
    getConf().set(SOURCE, source);
    getConf().set(TARGET, target);
    String driver = cmd.getOptionValue('c');
    if (driver != null) {
        getConf().set(JDBC_DRIVER, driver);
    }
    String props[] = cmd.getOptionValues('p');
    if (props != null) {
        for (int i=0; i<props.length; i++) {
            props[i] = Base64.encodeBase64String(props[i].getBytes(StandardCharsets.UTF_8));
        }
        getConf().setStrings(JDBC_PROPERTIES, props);
    }
    if (cmd.hasOption('i')) getConf().set(HalyardBulkUpdate.ELASTIC_INDEX_URL, cmd.getOptionValue('i'));
    TableMapReduceUtil.addDependencyJars(getConf(),
           HalyardExport.class,
           NTriplesUtil.class,
           Rio.class,
           AbstractRDFHandler.class,
           RDFFormat.class,
           RDFParser.class,
           HTable.class,
           HBaseConfiguration.class,
           AuthenticationProtos.class,
           Trace.class,
           Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    String cp = cmd.getOptionValue('l');
    if (cp != null) {
        String jars[] = cp.split(":");
        StringBuilder newCp = new StringBuilder();
        for (int i=0; i<jars.length; i++) {
            if (i > 0) newCp.append(':');
            newCp.append(addTmpFile(jars[i])); // append classpath entries to tmpfiles and trim paths from the classpath
        }
        getConf().set(JDBC_CLASSPATH, newCp.toString());
    }
    Job job = Job.getInstance(getConf(), "HalyardBulkExport " + source + " -> " + target);
    job.setJarByClass(HalyardBulkExport.class);
    job.setMaxMapAttempts(1);
    job.setMapperClass(BulkExportMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Void.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(QueryInputFormat.class);
    QueryInputFormat.setQueriesFromDirRecursive(job.getConfiguration(), queryFiles, false, 0);
    job.setOutputFormatClass(NullOutputFormat.class);
    TableMapReduceUtil.initCredentials(job);
    if (job.waitForCompletion(true)) {
        LOG.info("Bulk Export Completed..");
        return 0;
    }
    return -1;
}
 
Example 18
Source File: BlurOutputFormatMiniClusterTest.java    From incubator-retired-blur with Apache License 2.0
@Test
public void testBlurOutputFormat() throws IOException, InterruptedException, ClassNotFoundException, BlurException,
    TException {
  fileSystem.delete(inDir, true);
  String tableName = "testBlurOutputFormat";
  writeRecordsFile("in/part1", 1, 1, 1, 1, "cf1");
  writeRecordsFile("in/part2", 1, 1, 2, 1, "cf1");

  Job job = Job.getInstance(conf, "blur index");
  job.setJarByClass(BlurOutputFormatMiniClusterTest.class);
  job.setMapperClass(CsvBlurMapper.class);
  job.setInputFormatClass(TextInputFormat.class);

  FileInputFormat.addInputPath(job, new Path(TEST_ROOT_DIR + "/in"));
  String tableUri = new Path(TEST_ROOT_DIR + "/blur/" + tableName).makeQualified(fileSystem.getUri(),
      fileSystem.getWorkingDirectory()).toString();
  CsvBlurMapper.addColumns(job, "cf1", "col");

  TableDescriptor tableDescriptor = new TableDescriptor();
  tableDescriptor.setShardCount(1);
  tableDescriptor.setTableUri(tableUri);
  tableDescriptor.setName(tableName);

  Iface client = getClient();
  client.createTable(tableDescriptor);

  BlurOutputFormat.setupJob(job, tableDescriptor);
  Path output = new Path(TEST_ROOT_DIR + "/out");
  BlurOutputFormat.setOutputPath(job, output);

  Path tablePath = new Path(tableUri);
  Path shardPath = new Path(tablePath, ShardUtil.getShardName(0));
  FileStatus[] listStatus = fileSystem.listStatus(shardPath);
  
  System.out.println("======" + listStatus.length);
  for (FileStatus fileStatus : listStatus) {
    System.out.println(fileStatus.getPath());
  }
  assertEquals(3, listStatus.length);

  assertTrue(job.waitForCompletion(true));
  Counters ctrs = job.getCounters();
  System.out.println("Counters: " + ctrs);

  client.loadData(tableName, output.toString());

  while (true) {
    TableStats tableStats = client.tableStats(tableName);
    System.out.println(tableStats);
    if (tableStats.getRowCount() > 0) {
      break;
    }
    Thread.sleep(100);
  }

  assertTrue(fileSystem.exists(tablePath));
  assertFalse(fileSystem.isFile(tablePath));

  FileStatus[] listStatusAfter = fileSystem.listStatus(shardPath);

  assertEquals(12, listStatusAfter.length);

}
 
Example 19
Source File: HadoopSegmentCreationJob.java    From incubator-pinot with Apache License 2.0
public void run()
    throws Exception {
  _logger.info("Starting {}", getClass().getSimpleName());

  // Initialize all directories
  _outputDirFileSystem = FileSystem.get(new Path(_outputDir).toUri(), getConf());
  JobPreparationHelper.mkdirs(_outputDirFileSystem, new Path(_outputDir), _defaultPermissionsMask);
  JobPreparationHelper.mkdirs(_outputDirFileSystem, new Path(_stagingDir), _defaultPermissionsMask);
  Path stagingInputDir = new Path(_stagingDir, "input");
  JobPreparationHelper.mkdirs(_outputDirFileSystem, stagingInputDir, _defaultPermissionsMask);

  // Gather all data files
  List<Path> dataFilePaths = getDataFilePaths(_inputPattern);
  int numDataFiles = dataFilePaths.size();
  if (numDataFiles == 0) {
    String errorMessage = "No data file founded with pattern: " + _inputPattern;
    _logger.error(errorMessage);
    throw new RuntimeException(errorMessage);
  } else {
    _logger.info("Creating segments with data files: {}", dataFilePaths);
    for (int i = 0; i < numDataFiles; i++) {
      Path dataFilePath = dataFilePaths.get(i);
      try (DataOutputStream dataOutputStream = _outputDirFileSystem
          .create(new Path(stagingInputDir, Integer.toString(i)))) {
        dataOutputStream.write(StringUtil.encodeUtf8(dataFilePath.toString() + " " + i));
        dataOutputStream.flush();
      }
    }
  }

  // Set up the job
  Job job = Job.getInstance(getConf());
  job.setJarByClass(getClass());
  job.setJobName(getClass().getName());

  Configuration jobConf = job.getConfiguration();
  String hadoopTokenFileLocation = System.getenv("HADOOP_TOKEN_FILE_LOCATION");
  if (hadoopTokenFileLocation != null) {
    jobConf.set("mapreduce.job.credentials.binary", hadoopTokenFileLocation);
  }
  jobConf.setInt(JobContext.NUM_MAPS, numDataFiles);

  // Set table config and schema
  TableConfig tableConfig = getTableConfig();
  if (tableConfig != null) {
    validateTableConfig(tableConfig);
    jobConf.set(JobConfigConstants.TABLE_CONFIG, tableConfig.toJsonString());
  }
  jobConf.set(JobConfigConstants.SCHEMA, getSchema().toSingleLineJsonString());

  // Set additional configurations
  for (Map.Entry<Object, Object> entry : _properties.entrySet()) {
    jobConf.set(entry.getKey().toString(), entry.getValue().toString());
  }

  job.setMapperClass(getMapperClass());
  job.setNumReduceTasks(0);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(Text.class);

  FileInputFormat.addInputPath(job, stagingInputDir);
  FileOutputFormat.setOutputPath(job, new Path(_stagingDir, "output"));

  addDepsJarToDistributedCache(job);
  addAdditionalJobProperties(job);

  // Submit the job
  job.waitForCompletion(true);
  if (!job.isSuccessful()) {
    throw new RuntimeException("Job failed: " + job);
  }

  moveSegmentsToOutputDir();

  // Delete the staging directory
  _logger.info("Deleting the staging directory: {}", _stagingDir);
  _outputDirFileSystem.delete(new Path(_stagingDir), true);
}
 
Example 20
Source File: DBInputFormat.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job The map-reduce job
 * @param inputClass the class object implementing DBWritable, which is the
 * Java object holding tuple fields.
 * @param inputQuery the input query to select fields. Example :
 * "SELECT f1, f2, f3 FROM Mytable ORDER BY f1"
 * @param inputCountQuery the input query that returns
 * the number of records in the table.
 * Example : "SELECT COUNT(f1) FROM Mytable"
 * @see #setInput(Job, Class, String, String, String, String...)
 */
public static void setInput(Job job,
    Class<? extends DBWritable> inputClass,
    String inputQuery, String inputCountQuery) {
  job.setInputFormatClass(DBInputFormat.class);
  DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
  dbConf.setInputClass(inputClass);
  dbConf.setInputQuery(inputQuery);
  dbConf.setInputCountQuery(inputCountQuery);
}
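A hypothetical call, reusing the sample queries from the javadoc above; the JDBC driver, URL, credentials, and the MyRecord DBWritable class are invented for illustration, and the sketch assumes the standard Hadoop DBConfiguration.configureDB helper for setting the connection on the job.

// Hypothetical usage; MyRecord and the JDBC settings are illustrative only.
DBConfiguration.configureDB(job.getConfiguration(),
    "com.mysql.jdbc.Driver", "jdbc:mysql://localhost/mydb", "user", "password");
DBInputFormat.setInput(job, MyRecord.class,
    "SELECT f1, f2, f3 FROM Mytable ORDER BY f1",  // input query
    "SELECT COUNT(f1) FROM Mytable");              // count query used to size the splits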