Java Code Examples for org.apache.hadoop.mapred.JobConf#setInputFormat()

The following examples show how to use org.apache.hadoop.mapred.JobConf#setInputFormat(). Each example is taken from an open-source project; the source file, project, and license are noted above it.
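
Before the project examples, here is a minimal, self-contained sketch of a typical call site. It is not taken from any of the projects below; the class name, input and output paths, and the choice of TextInputFormat are illustrative assumptions. The class passed to setInputFormat() determines how the job's input files are split and how records are read for the mappers.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetInputFormatExample {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(SetInputFormatExample.class);
    job.setJobName("setInputFormat-example");

    // The input format controls how input files are split and how records are read.
    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));

    // Identity mapper/reducer simply pass the (LongWritable offset, Text line) pairs through.
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(IdentityReducer.class);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);
  }
}

The project examples below follow the same pattern, swapping in whichever InputFormat matches their data source.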
Example 1
Source File: NNBench.java    From hadoop with Apache License 2.0
/**
 * Run the test
 * 
 * @throws IOException on error
 */
public static void runTests() throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);
  
  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  
  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);
  
  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
 
Example 2
Source File: SliveTest.java    From big-c with Apache License 2.0
/**
 * Sets up a job conf for the given job using the given config object. Ensures
 * that the correct input format is set, along with the mapper and reducer
 * classes, the input and output key and value classes, and any other job
 * configuration.
 * 
 * @param config
 * @return JobConf representing the job to be run
 * @throws IOException
 */
private JobConf getJob(ConfigExtractor config) throws IOException {
  JobConf job = new JobConf(config.getConfig(), SliveTest.class);
  job.setInputFormat(DummyInputFormat.class);
  FileOutputFormat.setOutputPath(job, config.getOutputPath());
  job.setMapperClass(SliveMapper.class);
  job.setPartitionerClass(SlivePartitioner.class);
  job.setReducerClass(SliveReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setCompressOutput(job, false);
  job.setNumReduceTasks(config.getReducerAmount());
  job.setNumMapTasks(config.getMapAmount());
  return job;
}
 
Example 3
Source File: DistCpV1.java    From big-c with Apache License 2.0
private static JobConf createJobConf(Configuration conf) {
  JobConf jobconf = new JobConf(conf, DistCpV1.class);
  jobconf.setJobName(conf.get("mapred.job.name", NAME));

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobconf.setMapSpeculativeExecution(false);

  jobconf.setInputFormat(CopyInputFormat.class);
  jobconf.setOutputKeyClass(Text.class);
  jobconf.setOutputValueClass(Text.class);

  jobconf.setMapperClass(CopyFilesMapper.class);
  jobconf.setNumReduceTasks(0);
  return jobconf;
}
 
Example 4
Source File: NNBench.java    From RDFS with Apache License 2.0
/**
 * Run the test
 * 
 * @throws IOException on error
 */
public static void runTests(Configuration config) throws IOException {
  config.setLong("io.bytes.per.checksum", bytesPerChecksum);
  
  JobConf job = new JobConf(config, NNBench.class);

  job.setJobName("NNBench-" + operation);
  FileInputFormat.setInputPaths(job, new Path(baseDir, CONTROL_DIR_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  
  // Explicitly set number of max map attempts to 1.
  job.setMaxMapAttempts(1);
  
  // Explicitly turn off speculative execution
  job.setSpeculativeExecution(false);

  job.setMapperClass(NNBenchMapper.class);
  job.setReducerClass(NNBenchReducer.class);

  FileOutputFormat.setOutputPath(job, new Path(baseDir, OUTPUT_DIR_NAME));
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks((int) numberOfReduces);
  JobClient.runJob(job);
}
 
Example 5
Source File: ReadDataJob.java    From tracing-framework with BSD 3-Clause "New" or "Revised" License
public void configure(JobConf job) {
    // Set the mapper and reducers
    job.setMapperClass(TestMapper.class);
    // job.setReducerClass(TestReducer.class);

    // Set the output types of the mapper and reducer
    // job.setMapOutputKeyClass(IntWritable.class);
    // job.setMapOutputValueClass(NullWritable.class);
    // job.setOutputKeyClass(NullWritable.class);
    // job.setOutputValueClass(NullWritable.class);

    // Make sure this jar is included
    job.setJarByClass(TestMapper.class);

    // Specify the input and output data formats
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);

    // Turn off speculative execution
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    // Add the job input path
    FileInputFormat.addInputPath(job, new Path(this.input_filename));
}
 
Example 6
Source File: CrawlDbReader.java    From nutch-htmlunit with Apache License 2.0
public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status, Integer retry) throws IOException {
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb dump: starting");
    LOG.info("CrawlDb db: " + crawlDb);
  }

  Path outFolder = new Path(output);

  JobConf job = new NutchJob(config);
  job.setJobName("dump " + crawlDb);

  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, outFolder);

  if (format.equals("csv")) {
    job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
  }
  else if (format.equals("crawldb")) {
    job.setOutputFormat(MapFileOutputFormat.class);
  } else {
    job.setOutputFormat(TextOutputFormat.class);
  }

  if (status != null) job.set("status", status);
  if (regex != null) job.set("regex", regex);
  if (retry != null) job.setInt("retry", retry);
  
  job.setMapperClass(CrawlDbDumpMapper.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);

  JobClient.runJob(job);
  if (LOG.isInfoEnabled()) { LOG.info("CrawlDb dump: done"); }
}
 
Example 7
Source File: HadoopWordCount1.java    From ignite with Apache License 2.0
/**
 * Sets task classes with related info if needed into configuration object.
 *
 * @param jobConf Configuration to change.
 * @param setMapper Option to set mapper and input format classes.
 * @param setCombiner Option to set combiner class.
 * @param setReducer Option to set reducer and output format classes.
 */
public static void setTasksClasses(JobConf jobConf, boolean setMapper, boolean setCombiner, boolean setReducer) {
    if (setMapper) {
        jobConf.setMapperClass(HadoopWordCount1Map.class);
        jobConf.setInputFormat(TextInputFormat.class);
    }

    if (setCombiner)
        jobConf.setCombinerClass(HadoopWordCount1Reduce.class);

    if (setReducer) {
        jobConf.setReducerClass(HadoopWordCount1Reduce.class);
        jobConf.setOutputFormat(TextOutputFormat.class);
    }
}
 
Example 8
Source File: MultipleInputs.java    From hadoop-gpu with Apache License 2.0
/**
 * Add a {@link Path} with a custom {@link InputFormat} to the list of
 * inputs for the map-reduce job.
 * 
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path,
    Class<? extends InputFormat> inputFormatClass) {

  String inputFormatMapping = path.toString() + ";"
     + inputFormatClass.getName();
  String inputFormats = conf.get("mapred.input.dir.formats");
  conf.set("mapred.input.dir.formats",
     inputFormats == null ? inputFormatMapping : inputFormats + ","
         + inputFormatMapping);

  conf.setInputFormat(DelegatingInputFormat.class);
}
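
For context, a driver would normally call this helper rather than edit mapred.input.dir.formats by hand. A hypothetical invocation (the driver class and paths are illustrative, not taken from hadoop-gpu) might look like:

JobConf conf = new JobConf(MyDriver.class);   // MyDriver is a placeholder driver class
MultipleInputs.addInputPath(conf, new Path("/data/plain"), TextInputFormat.class);
MultipleInputs.addInputPath(conf, new Path("/data/seq"), SequenceFileInputFormat.class);
// Both path-to-format mappings are recorded in "mapred.input.dir.formats", and the job's
// input format becomes DelegatingInputFormat, which dispatches to the right format per path.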
 
Example 9
Source File: TableMapReduceUtil.java    From hbase with Apache License 2.0
/**
 * Use this before submitting a TableMap job. It will
 * appropriately set up the JobConf.
 *
 * @param table  The table name to read from.
 * @param columns  The columns to scan.
 * @param mapper  The mapper class to use.
 * @param outputKeyClass  The class of the output key.
 * @param outputValueClass  The class of the output value.
 * @param job  The current job configuration to adjust.
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *           job classes via the distributed cache (tmpjars).
 */
public static void initTableMapJob(String table, String columns,
  Class<? extends TableMap> mapper,
  Class<?> outputKeyClass,
  Class<?> outputValueClass, JobConf job, boolean addDependencyJars,
  Class<? extends InputFormat> inputFormat) {

  job.setInputFormat(inputFormat);
  job.setMapOutputValueClass(outputValueClass);
  job.setMapOutputKeyClass(outputKeyClass);
  job.setMapperClass(mapper);
  job.setStrings("io.serializations", job.get("io.serializations"),
      MutationSerialization.class.getName(), ResultSerialization.class.getName());
  FileInputFormat.addInputPaths(job, table);
  job.set(TableInputFormat.COLUMN_LIST, columns);
  if (addDependencyJars) {
    try {
      addDependencyJars(job);
    } catch (IOException e) {
      LOG.error("IOException encountered while adding dependency jars", e);
    }
  }
  try {
    initCredentials(job);
  } catch (IOException ioe) {
    // just spit out the stack trace?  really?
    LOG.error("IOException encountered while initializing credentials", ioe);
  }
}
 
Example 10
Source File: BusyLegs.java    From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {

    GfxdDataSerializable.initTypes();

    JobConf conf = new JobConf(getConf());
    conf.setJobName("Busy Leg Count");

    Path outputPath = new Path(args[0]);
    String hdfsHomeDir = args[1];
    String tableName = args[2];

    outputPath.getFileSystem(conf).delete(outputPath, true);

    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

    // Configure Mapper
    conf.setInputFormat(RowInputFormat.class);
    conf.setMapperClass(SampleMapper.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    // Configure Reducer
    conf.setReducerClass(SampleReducer.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);
    return 0;
  }
 
Example 11
Source File: TestMRAppWithCombiner.java    From hadoop with Apache License 2.0
@Test
public void testCombinerShouldUpdateTheReporter() throws Exception {
  JobConf conf = new JobConf(mrCluster.getConfig());
  int numMaps = 5;
  int numReds = 2;
  Path in = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-in");
  Path out = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-out");
  createInputOutPutFolder(in, out, numMaps);
  conf.setJobName("test-job-with-combiner");
  conf.setMapperClass(IdentityMapper.class);
  conf.setCombinerClass(MyCombinerToCheckReporter.class);
  //conf.setJarByClass(MyCombinerToCheckReporter.class);
  conf.setReducerClass(IdentityReducer.class);
  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, in);
  FileOutputFormat.setOutputPath(conf, out);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);
  
  runJob(conf);
}
 
Example 12
Source File: LinkRank.java    From anthelion with Apache License 2.0
/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 * 
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and
 * eliminating link farms and other spam pages.
 * 
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 * 
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output)
  throws IOException {

  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);

  // add the loop database if it exists, isn't null
  if (loopDb != null) {
    FileInputFormat.addInputPath(inverter, loopDb);
  }
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  }
  catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}
 
Example 13
Source File: MapUtils.java    From incubator-tez with Apache License 2.0
public static LogicalIOProcessorRuntimeTask createLogicalTask(FileSystem fs, Path workDir,
    JobConf jobConf, int mapId, Path mapInput,
    TezUmbilical umbilical, String dagName,
    String vertexName, List<InputSpec> inputSpecs,
    List<OutputSpec> outputSpecs) throws Exception {
  jobConf.setInputFormat(SequenceFileInputFormat.class);

  ProcessorDescriptor mapProcessorDesc = new ProcessorDescriptor(
      MapProcessor.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(jobConf));
  
  Token<JobTokenIdentifier> shuffleToken = new Token<JobTokenIdentifier>();

  TaskSpec taskSpec = new TaskSpec(
      TezTestUtils.getMockTaskAttemptId(0, 0, mapId, 0),
      dagName, vertexName,
      mapProcessorDesc,
      inputSpecs,
      outputSpecs, null);

  Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<String, ByteBuffer>();
  serviceConsumerMetadata.put(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID,
      ShuffleUtils.convertJobTokenToBytes(shuffleToken));
  
  LogicalIOProcessorRuntimeTask task = new LogicalIOProcessorRuntimeTask(
      taskSpec,
      0,
      jobConf,
      new String[] {workDir.toString()},
      umbilical,
      serviceConsumerMetadata,
      HashMultimap.<String, String>create());
  return task;
}
 
Example 14
Source File: DataJoinJob.java    From big-c with Apache License 2.0
public static JobConf createDataJoinJob(String args[]) throws IOException {

    String inputDir = args[0];
    String outputDir = args[1];
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
      System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
      System.out.println("Using TextInputFormat: " + args[2]);
      inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
      System.out.println("Using SequenceFileOutputFormat: " + args[7]);
      outputFormat = SequenceFileOutputFormat.class;
      outputValueClass = getClassByName(args[7]);
    } else {
      System.out.println("Using TextOutputFormat: " + args[7]);
    }
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
      maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
      jobName = args[9];
    }
    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, DataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);

    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir), true);
    FileInputFormat.setInputPaths(job, inputDir);

    job.setInputFormat(inputFormat);

    job.setMapperClass(mapper);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job,
            SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
  }
 
Example 15
Source File: TradeBuyOrdersHdfsDataVerifier.java    From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {

    GfxdDataSerializable.initTypes();

    JobConf conf = new JobConf(getConf());
    conf.setJobName("TradeBuyOrdersHdfsDataVerifier");

    String hdfsHomeDir = args[0];
    String url         = args[1];
    String tableName   = args[2];

    System.out.println("TradeBuyOrdersHdfsDataVerifier.run() invoked with " 
                       + " hdfsHomeDir = " + hdfsHomeDir 
                       + " url = " + url
                       + " tableName = " + tableName);

    // Job-specific params
    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);
    
    conf.setInputFormat(RowInputFormat.class);
    conf.setMapperClass(HdfsDataMapper.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(TradeBuyOrdersRow.class);
    
    conf.setReducerClass(HdfsDataReducer.class);
    conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
    //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
    conf.set(RowOutputFormat.OUTPUT_URL, url);
    conf.setOutputFormat(RowOutputFormat.class);
    conf.setOutputKeyClass(Key.class);
    conf.setOutputValueClass(TradeBuyOrdersOutputObject.class);

    StringBuffer aStr = new StringBuffer();
    aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
    aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
    aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
    aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
    System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

    
    FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
    
    JobClient.runJob(conf);
    return 0;
  }
 
Example 16
Source File: TestMultiMRInput.java    From incubator-tez with Apache License 2.0
@Test(timeout = 5000)
public void testMultipleSplits() throws Exception {

  Path workDir = new Path(TEST_ROOT_DIR, "testMultipleSplits");
  JobConf jobConf = new JobConf(defaultConf);
  jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(jobConf, workDir);

  MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
  builder.setInputFormatName(SequenceFileInputFormat.class.getName());
  builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
  byte[] payload = builder.build().toByteArray();

  TezInputContext inputContext = createTezInputContext(payload);

  MultiMRInput input = new MultiMRInput();
  input.setNumPhysicalInputs(2);
  input.initialize(inputContext);
  List<Event> eventList = new ArrayList<Event>();

  LinkedHashMap<LongWritable, Text> data = new LinkedHashMap<LongWritable, Text>();

  String file1 = "file1";
  LinkedHashMap<LongWritable, Text> data1 = createInputData(localFs, workDir, jobConf, file1, 0,
      10);

  String file2 = "file2";
  LinkedHashMap<LongWritable, Text> data2 = createInputData(localFs, workDir, jobConf, file2, 10,
      20);

  data.putAll(data1);
  data.putAll(data2);

  SequenceFileInputFormat<LongWritable, Text> format =
      new SequenceFileInputFormat<LongWritable, Text>();
  InputSplit[] splits = format.getSplits(jobConf, 2);
  assertEquals(2, splits.length);

  MRSplitProto splitProto1 = MRHelpers.createSplitProto(splits[0]);
  RootInputDataInformationEvent event1 = new RootInputDataInformationEvent(0,
      splitProto1.toByteArray());

  MRSplitProto splitProto2 = MRHelpers.createSplitProto(splits[1]);
  RootInputDataInformationEvent event2 = new RootInputDataInformationEvent(0,
      splitProto2.toByteArray());

  eventList.clear();
  eventList.add(event1);
  eventList.add(event2);
  input.handleEvents(eventList);

  int readerCount = 0;
  for (KeyValueReader reader : input.getKeyValueReaders()) {
    readerCount++;
    while (reader.next()) {
      if (data.size() == 0) {
        fail("Found more records than expected");
      }
      Object key = reader.getCurrentKey();
      Object val = reader.getCurrentValue();
      assertEquals(val, data.remove(key));
    }
  }
  assertEquals(2, readerCount);
}
 
Example 17
Source File: Submitter.java    From RDFS with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
  // default map output types to Text
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Save the user's partitioner and hook in ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }
  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }
  String textClassname = Text.class.getName();
  setIfUnset(conf, "mapred.mapoutput.key.class", textClassname);
  setIfUnset(conf, "mapred.mapoutput.value.class", textClassname);
  setIfUnset(conf, "mapred.output.key.class", textClassname);
  setIfUnset(conf, "mapred.output.value.class", textClassname);
  
  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
    conf.setClass("mapred.pipes.user.inputformat", 
                  conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }
  
  String exec = getExecutable(conf);
  if (exec == null) {
    throw new IllegalArgumentException("No application program defined.");
  }
  // add default debug script only when executable is expressed as
  // <path>#<executable>
  if (exec.contains("#")) {
    DistributedCache.createSymlink(conf);
    // set default gdb commands for map and reduce task 
    String defScript = "$HADOOP_HOME/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf,"mapred.map.task.debug.script",defScript);
    setIfUnset(conf,"mapred.reduce.task.debug.script",defScript);
  }
  URI[] fileCache = DistributedCache.getCacheFiles(conf);
  if (fileCache == null) {
    fileCache = new URI[1];
  } else {
    URI[] tmp = new URI[fileCache.length+1];
    System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
    fileCache = tmp;
  }
  try {
    fileCache[0] = new URI(exec);
  } catch (URISyntaxException e) {
    IOException ie = new IOException("Problem parsing executable URI " + exec);
    ie.initCause(e);
    throw ie;
  }
  DistributedCache.setCacheFiles(fileCache, conf);
}
 
Example 18
Source File: TradeCustomersHdfsDataVerifier.java    From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {

    GfxdDataSerializable.initTypes();

    JobConf conf = new JobConf(getConf());
    conf.setJobName("TradeCustomersHdfsDataVerifier");

    String hdfsHomeDir = args[0];
    String url         = args[1];
    String tableName   = args[2];

    System.out.println("TradeCustomersHdfsDataVerifier.run() invoked with " 
                       + " hdfsHomeDir = " + hdfsHomeDir 
                       + " url = " + url
                       + " tableName = " + tableName);

    // Job-specific params
    conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
    conf.set(RowInputFormat.INPUT_TABLE, tableName);
    conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);
    
    conf.setInputFormat(RowInputFormat.class);
    conf.setMapperClass(HdfsDataMapper.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(TradeCustomersRow.class);
    
    conf.setReducerClass(HdfsDataReducer.class);
    conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
    //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
    conf.set(RowOutputFormat.OUTPUT_URL, url);
    conf.setOutputFormat(RowOutputFormat.class);
    conf.setOutputKeyClass(Key.class);
    conf.setOutputValueClass(TradeCustomerOutputObject.class);

    StringBuffer aStr = new StringBuffer();
    aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
    aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
    aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
    aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
    System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

    
    FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
    
    JobClient.runJob(conf);
    return 0;
  }
 
Example 19
Source File: FreeGenerator.java    From anthelion with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
    System.err.println("\tinputDir\tinput directory containing one or more input files.");
    System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
    System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
    System.err.println("\t-filter\trun current URLFilters on input URLs");
    System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
    return -1;
  }
  boolean filter = false;
  boolean normalize = false;
  if (args.length > 2) {
    for (int i = 2; i < args.length; i++) {
      if (args[i].equals("-filter")) {
        filter = true;
      } else if (args[i].equals("-normalize")) {
        normalize = true;
      } else {
        LOG.error("Unknown argument: " + args[i] + ", exiting ...");
        return -1;
      }
    }
  }
  
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("FreeGenerator: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  job.setBoolean(FILTER_KEY, filter);
  job.setBoolean(NORMALIZE_KEY, normalize);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  job.setInputFormat(TextInputFormat.class);
  job.setMapperClass(FG.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Generator.SelectorEntry.class);
  job.setPartitionerClass(URLPartitioner.class);
  job.setReducerClass(FG.class);
  String segName = Generator.generateSegmentName();
  job.setNumReduceTasks(job.getNumMapTasks());
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(Generator.HashComparator.class);
  FileOutputFormat.setOutputPath(job, new Path(args[1],
      new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
  try {
    JobClient.runJob(job);
  } catch (Exception e) {
    LOG.error("FAILED: " + StringUtils.stringifyException(e));
    return -1;
  }
  long end = System.currentTimeMillis();
  LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  return 0;
}
 
Example 20
Source File: ResultMergeRemoteSpark.java    From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs, long rlen, long clen, int blen)
{
	String jobname = "ParFor-RMSP";
	long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
	
	SparkExecutionContext sec = (SparkExecutionContext)_ec;
	boolean withCompare = (compare!=null);

	RDDObject ret = null;
	
	//determine degree of parallelism
	int numRed = determineNumReducers(rlen, clen, blen, _numReducers);
	
	//sanity check for empty src files
	if( inputs == null || inputs.length==0  )
		throw new DMLRuntimeException("Execute merge should never be called with no inputs.");
	
	try
	{
		//note: initial implementation via union over all result rdds discarded due to 
		//stack overflow errors with many parfor tasks, and thus many rdds
		
		//Step 1: construct input rdd from all result files of parfor workers
		//a) construct job conf with all files
		InputOutputInfo ii = InputOutputInfo.get(DataType.MATRIX, FileFormat.BINARY);
		JobConf job = new JobConf( "test" );
		job.setJobName(jobname);
		job.setInputFormat(ii.inputFormatClass);
		Path[] paths = new Path[ inputs.length ];
		for(int i=0; i<paths.length; i++) {
			//ensure input exists on hdfs (e.g., if in-memory or RDD)
			inputs[i].exportData();
			paths[i] = new Path( inputs[i].getFileName() );
			//update rdd handle to allow lazy evaluation by guarding 
			//against cleanup of temporary result files
			setRDDHandleForMerge(inputs[i], sec);
		}
		FileInputFormat.setInputPaths(job, paths);
		
		//b) create rdd from input files w/ deep copy of keys and blocks
		JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext()
				.hadoopRDD(job, ii.inputFormatClass, ii.keyClass, ii.valueClass)
				.mapPartitionsToPair(new CopyMatrixBlockPairFunction(true), true);
		
		//Step 2a: merge with compare
		JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
		if( withCompare ) {
			JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) 
					sec.getRDDHandleForMatrixObject(compare, FileFormat.BINARY);
			
			//merge values which differ from compare values
			ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare(_isAccum);
			out = rdd.groupByKey(numRed) //group all result blocks per key
				.join(compareRdd)        //join compare block and result blocks 
				.mapToPair(cfun);        //merge result blocks w/ compare
		}
		//Step 2b: merge without compare
		else {
			//direct merge in any order (disjointness guaranteed)
			out = _isAccum ?
				RDDAggregateUtils.sumByKeyStable(rdd, false) :
				RDDAggregateUtils.mergeByKey(rdd, false);
		}
		
		//Step 3: create output rdd handle w/ lineage
		ret = new RDDObject(out);
		for(int i=0; i<paths.length; i++)
			ret.addLineageChild(inputs[i].getRDDHandle());
		if( withCompare )
			ret.addLineageChild(compare.getRDDHandle());
	}
	catch( Exception ex ) {
		throw new DMLRuntimeException(ex);
	}
	
	//maintain statistics
	Statistics.incrementNoOfCompiledSPInst();
	Statistics.incrementNoOfExecutedSPInst();
	if( DMLScript.STATISTICS ){
		Statistics.maintainCPHeavyHitters(jobname, System.nanoTime()-t0);
	}
	
	return ret;
}