org.apache.tez.mapreduce.hadoop.MRJobConfig Java Examples

The following examples show how to use org.apache.tez.mapreduce.hadoop.MRJobConfig. Each example lists the source file it was taken from, along with the originating open-source project and its license.
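As a quick orientation before the examples: the recurring pattern is to set MRJobConfig keys on a Hadoop Configuration, translate them into Tez runtime keys, and serialize the result into a user payload for an input or output descriptor. A minimal sketch of that flow, assuming the Tez 0.5+ API (the TextInputFormat choice is illustrative, not taken from any single example below):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.mapreduce.hadoop.MRHelpers;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;

static UserPayload buildMRPayload() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(MRJobConfig.NEW_API_MAPPER_CONFIG, true); // use the new mapreduce API
  conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
      org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class.getName());
  MRHelpers.translateMRConfToTez(conf);            // map MR keys onto Tez runtime keys
  return TezUtils.createUserPayloadFromConf(conf); // serialize for an Input/OutputDescriptor
}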
Example #1
Source File: TezDagBuilder.java    From spork with Apache License 2.0
private void addCombiner(PhysicalPlan combinePlan, TezOperator pkgTezOp,
        Configuration conf) throws IOException {
    POPackage combPack = (POPackage) combinePlan.getRoots().get(0);
    POLocalRearrange combRearrange = (POLocalRearrange) combinePlan
            .getLeaves().get(0);
    setIntermediateOutputKeyValue(combRearrange.getKeyType(), conf, pkgTezOp);

    LoRearrangeDiscoverer lrDiscoverer = new LoRearrangeDiscoverer(
            combinePlan, pkgTezOp, combPack);
    lrDiscoverer.visit();

    combinePlan.remove(combPack);
    conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINER_CLASS,
            MRCombiner.class.getName());
    conf.set(MRJobConfig.COMBINE_CLASS_ATTR,
            PigCombiner.Combine.class.getName());
    conf.setBoolean(MRConfiguration.MAPPER_NEW_API, true);
    conf.set("pig.pigContext", ObjectSerializer.serialize(pc));
    conf.set("udf.import.list",
            ObjectSerializer.serialize(PigContext.getPackageImportList()));
    conf.set("pig.combinePlan", ObjectSerializer.serialize(combinePlan));
    conf.set("pig.combine.package", ObjectSerializer.serialize(combPack));
    conf.set("pig.map.keytype", ObjectSerializer
            .serialize(new byte[] { combRearrange.getKeyType() }));
}
 
Example #2
Source File: TestMRCombiner.java    From tez with Apache License 2.0
@Test
public void testTop2RunNewCombiner() throws IOException, InterruptedException {
  TezConfiguration conf = new TezConfiguration();
  setKeyAndValueClassTypes(conf);
  conf.setBoolean("mapred.mapper.new-api", true);
  conf.setClass(MRJobConfig.COMBINE_CLASS_ATTR, Top2NewReducer.class,
      Object.class);
  TaskContext taskContext = getTaskContext(conf);
  MRCombiner combiner = new MRCombiner(taskContext);
  Writer writer = Mockito.mock(Writer.class);
  combiner.combine(new TezRawKeyValueIteratorTest(), writer);
  long inputRecords = taskContext.getCounters().findCounter(TaskCounter.COMBINE_INPUT_RECORDS).getValue();
  long outputRecords = taskContext.getCounters().findCounter(TaskCounter.COMBINE_OUTPUT_RECORDS).getValue();
  assertEquals(6, inputRecords);
  assertEquals(5, outputRecords);
}
 
Example #3
Source File: TestMRCombiner.java    From tez with Apache License 2.0
@Test
public void testRunNewCombiner() throws IOException, InterruptedException {
  TezConfiguration conf = new TezConfiguration();
  setKeyAndValueClassTypes(conf);
  conf.setBoolean("mapred.mapper.new-api", true);
  conf.setClass(MRJobConfig.COMBINE_CLASS_ATTR, NewReducer.class,
      Object.class);
  TaskContext taskContext = getTaskContext(conf);
  MRCombiner combiner = new MRCombiner(taskContext);
  Writer writer = Mockito.mock(Writer.class);
  combiner.combine(new TezRawKeyValueIteratorTest(), writer);
  long inputRecords = taskContext.getCounters().findCounter(TaskCounter.COMBINE_INPUT_RECORDS).getValue();
  long outputRecords = taskContext.getCounters().findCounter(TaskCounter.COMBINE_OUTPUT_RECORDS).getValue();
  assertEquals(6, inputRecords);
  assertEquals(3, outputRecords);
  // verify combiner output keys and values
  verifyKeyAndValues(writer);
}
 
Example #4
Source File: MROutput.java    From tez with Apache License 2.0
/**
 * Creates the user payload to be set on the OutputDescriptor for MROutput
 */
private UserPayload createUserPayload() {
  // always record which API is being used
  conf.setBoolean(MRJobConfig.NEW_API_REDUCER_CONFIG, useNewApi);
  conf.setBoolean(MRJobConfig.NEW_API_MAPPER_CONFIG, useNewApi);
  if (outputFormatProvided) {
    if (useNewApi) {
      conf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, outputFormat.getName());
    } else {
      conf.set("mapred.output.format.class", outputFormat.getName());
    }
  }
  MRHelpers.translateMRConfToTez(conf);
  try {
    return TezUtils.createUserPayloadFromConf(conf);
  } catch (IOException e) {
    throw new TezUncheckedException(e);
  }
}
 
Example #5
Source File: MROutput.java    From tez with Apache License 2.0
private MROutputConfigBuilder setOutputPath(String outputPath) {
  boolean passNewLazyOutputFormatCheck =
      (LazyOutputFormat.class.isAssignableFrom(outputFormat)) &&
      org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.class.
          isAssignableFrom(conf.getClass(
              MRJobConfig.LAZY_OUTPUTFORMAT_OUTPUTFORMAT, null));
  boolean passOldLazyOutputFormatCheck =
      (org.apache.hadoop.mapred.lib.LazyOutputFormat.class.
          isAssignableFrom(outputFormat)) &&
      FileOutputFormat.class.isAssignableFrom(conf.getClass(
          MRJobConfig.LAZY_OUTPUTFORMAT_OUTPUTFORMAT, null));

  if (!(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.class.
      isAssignableFrom(outputFormat) ||
      FileOutputFormat.class.isAssignableFrom(outputFormat) ||
      passNewLazyOutputFormatCheck || passOldLazyOutputFormatCheck)) {
    throw new TezUncheckedException("When setting outputPath the outputFormat must " +
        "be assignable from either org.apache.hadoop.mapred.FileOutputFormat or " +
        "org.apache.hadoop.mapreduce.lib.output.FileOutputFormat. " +
        "Otherwise use the non-path config builder." +
        " Given: " + outputFormat.getName());
  }
  conf.set(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.OUTDIR, outputPath);
  this.outputPath = outputPath;
  return this;
}
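The builder above is normally obtained through MROutput.createConfigBuilder rather than constructed directly. A hedged usage sketch (assumes the Tez 0.5+ builder API; the output path and vertex wiring are illustrative):

DataSinkDescriptor sink = MROutput.createConfigBuilder(new Configuration(),
    org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class, "/data/out") // path is illustrative
    .build();
vertex.addDataSink("MROutput", sink);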
 
Example #6
Source File: MultiMROutput.java    From tez with Apache License 2.0
@SuppressWarnings("unchecked")
private synchronized RecordWriter getNewRecordWriter(
    TaskAttemptContext taskContext, String baseFileName)
    throws IOException, InterruptedException {

  // look for record-writer in the cache
  RecordWriter writer = newRecordWriters.get(baseFileName);

  // If not in cache, create a new one
  if (writer == null) {
    // get the record writer from context output format
    taskContext.getConfiguration().set(
        MRJobConfig.FILEOUTPUTFORMAT_BASE_OUTPUT_NAME, baseFileName);
    try {
      writer = ((OutputFormat) ReflectionUtils.newInstance(
          taskContext.getOutputFormatClass(), taskContext.getConfiguration()))
          .getRecordWriter(taskContext);
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
    // add the record-writer to the cache
    newRecordWriters.put(baseFileName, writer);
  }
  return writer;
}
 
Example #7
Source File: MRTask.java    From tez with Apache License 2.0
public void localizeConfiguration(JobConf jobConf)
    throws IOException, InterruptedException {
  jobConf.set(JobContext.TASK_ID, taskAttemptId.getTaskID().toString());
  jobConf.set(JobContext.TASK_ATTEMPT_ID, taskAttemptId.toString());
  jobConf.setInt(JobContext.TASK_PARTITION,
      taskAttemptId.getTaskID().getId());
  jobConf.set(JobContext.ID, taskAttemptId.getJobID().toString());
  
  jobConf.setBoolean(MRJobConfig.TASK_ISMAP, isMap);
  
  Path outputPath = FileOutputFormat.getOutputPath(jobConf);
  if (outputPath != null) {
    if ((committer instanceof FileOutputCommitter)) {
      FileOutputFormat.setWorkOutputPath(jobConf, 
        ((FileOutputCommitter)committer).getTaskAttemptPath(taskAttemptContext));
    } else {
      FileOutputFormat.setWorkOutputPath(jobConf, outputPath);
    }
  }
}
 
Example #8
Source File: SplitMetaInfoReaderTez.java    From tez with Apache License 2.0
public static TaskSplitMetaInfo[] readSplitMetaInfo(Configuration conf,
    FileSystem fs) throws IOException {
  FSDataInputStream in = null;
  try {
    in = getFSDataIS(conf, fs);
    final String jobSplitFile = MRJobConfig.JOB_SPLIT;
    final String basePath = conf.get(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR, ".");
    int numSplits = WritableUtils.readVInt(in); // TODO: check for insane values
    JobSplit.TaskSplitMetaInfo[] allSplitMetaInfo = new JobSplit.TaskSplitMetaInfo[numSplits];
    for (int i = 0; i < numSplits; i++) {
      JobSplit.SplitMetaInfo splitMetaInfo = new JobSplit.SplitMetaInfo();
      splitMetaInfo.readFields(in);
      JobSplit.TaskSplitIndex splitIndex = new JobSplit.TaskSplitIndex(
          new Path(basePath, jobSplitFile)
              .toUri().toString(), splitMetaInfo.getStartOffset());
      allSplitMetaInfo[i] = new JobSplit.TaskSplitMetaInfo(splitIndex,
          splitMetaInfo.getLocations(), splitMetaInfo.getInputDataLength());
    }
    return allSplitMetaInfo;
  } finally {
    if (in != null) {
      in.close();
    }
  }
}
 
Example #9
Source File: MRInput.java    From incubator-tez with Apache License 2.0
/**
 * Helper API to generate the user payload for the MRInput and
 * MRInputAMSplitGenerator (if used). The InputFormat will be invoked by Tez
 * at DAG runtime to generate the input splits.
 * 
 * @param conf
 *          Configuration for the InputFormat
 * @param inputFormatClassName
 *          Name of the class of the InputFormat
 * @param useNewApi
 *          use new mapreduce API or old mapred API
 * @param groupSplitsInAM
 *          do grouping of splits in the AM. If true then splits generated by
 *          the InputFormat will be grouped in the AM based on available
 *          resources, locality etc. This option may be set to true only when
 *          using MRInputAMSplitGenerator as the initializer class in
 *          {@link Vertex#addInput(String, org.apache.tez.dag.api.InputDescriptor, Class)}
 * @return the user payload to be set on the InputDescriptor of MRInput
 * @throws IOException
 */
public static byte[] createUserPayload(Configuration conf,
    String inputFormatClassName, boolean useNewApi, boolean groupSplitsInAM)
    throws IOException {
  Configuration inputConf = new JobConf(conf);
  String wrappedInputFormatClassName = null;
  String configInputFormatClassName = null;
  if (groupSplitsInAM) {
    wrappedInputFormatClassName = inputFormatClassName;
    configInputFormatClassName = TezGroupedSplitsInputFormat.class.getName();
  } else {
    wrappedInputFormatClassName = null;
    configInputFormatClassName = inputFormatClassName;
  }
  inputConf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
      configInputFormatClassName);
  inputConf.setBoolean("mapred.mapper.new-api", useNewApi);
  MRHelpers.translateVertexConfToTez(inputConf);
  MRHelpers.doJobClientMagic(inputConf);
  if (groupSplitsInAM) {
    return MRHelpers.createMRInputPayloadWithGrouping(inputConf,
        wrappedInputFormatClassName);
  } else {
    return MRHelpers.createMRInputPayload(inputConf, null);
  }
}
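A hedged sketch of calling this helper (assumes the incubator-tez 0.4-era API, in which descriptors carry raw byte[] payloads; the input format and path are illustrative):

Configuration conf = new Configuration();
conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, "/data/in"); // illustrative
byte[] payload = MRInput.createUserPayload(conf,
    org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class.getName(),
    true /* useNewApi */, true /* groupSplitsInAM */);
// Assumed 0.4-era wiring, following the javadoc above:
// vertex.addInput("MRInput",
//     new InputDescriptor(MRInput.class.getName()).setUserPayload(payload),
//     MRInputAMSplitGenerator.class);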
 
Example #10
Source File: MRTask.java    From incubator-tez with Apache License 2.0
public void localizeConfiguration(JobConf jobConf)
    throws IOException, InterruptedException {
  jobConf.set(JobContext.TASK_ID, taskAttemptId.getTaskID().toString());
  jobConf.set(JobContext.TASK_ATTEMPT_ID, taskAttemptId.toString());
  jobConf.setInt(JobContext.TASK_PARTITION,
      taskAttemptId.getTaskID().getId());
  jobConf.set(JobContext.ID, taskAttemptId.getJobID().toString());
  
  jobConf.setBoolean(MRJobConfig.TASK_ISMAP, isMap);
  
  Path outputPath = FileOutputFormat.getOutputPath(jobConf);
  if (outputPath != null) {
    if ((committer instanceof FileOutputCommitter)) {
      FileOutputFormat.setWorkOutputPath(jobConf, 
        ((FileOutputCommitter)committer).getTaskAttemptPath(taskAttemptContext));
    } else {
      FileOutputFormat.setWorkOutputPath(jobConf, outputPath);
    }
  }
}
 
Example #11
Source File: MRPartitioner.java    From incubator-tez with Apache License 2.0
public MRPartitioner(Configuration conf) {
  this.useNewApi = ConfigUtils.useNewApi(conf);
  int partitions = conf.getInt(TezRuntimeFrameworkConfigs.TEZ_RUNTIME_NUM_EXPECTED_PARTITIONS, 1);

  if (useNewApi) {
    oldPartitioner = null;
    if (partitions > 1) {
      newPartitioner = (org.apache.hadoop.mapreduce.Partitioner) ReflectionUtils
          .newInstance(
              (Class<? extends org.apache.hadoop.mapreduce.Partitioner<?, ?>>) conf
                  .getClass(MRJobConfig.PARTITIONER_CLASS_ATTR,
                      org.apache.hadoop.mapreduce.lib.partition.HashPartitioner.class), conf);
    } else {
      newPartitioner = new org.apache.hadoop.mapreduce.Partitioner() {
        @Override
        public int getPartition(Object key, Object value, int numPartitions) {
          return numPartitions - 1;
        }
      };
    }
  } else {
    newPartitioner = null;
    if (partitions > 1) {
      oldPartitioner = (org.apache.hadoop.mapred.Partitioner) ReflectionUtils.newInstance(
          (Class<? extends org.apache.hadoop.mapred.Partitioner>) conf.getClass(
              "mapred.partitioner.class", org.apache.hadoop.mapred.lib.HashPartitioner.class), conf);
    } else {
      oldPartitioner = new org.apache.hadoop.mapred.Partitioner() {
        @Override
        public void configure(JobConf job) {
        }

        @Override
        public int getPartition(Object key, Object value, int numPartitions) {
          return numPartitions - 1;
        }
      };
    }
  }
}
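To make MRPartitioner delegate to a custom new-API partitioner, the class is registered under MRJobConfig.PARTITIONER_CLASS_ATTR. A minimal sketch (MyPartitioner is a hypothetical Partitioner subclass, and the new-api flag mirrors the test examples above):

conf.setBoolean("mapred.mapper.new-api", true);
conf.setClass(MRJobConfig.PARTITIONER_CLASS_ATTR, MyPartitioner.class,
    org.apache.hadoop.mapreduce.Partitioner.class);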
 
Example #12
Source File: MROutput.java    From tez with Apache License 2.0
protected String getOutputFileNamePrefix() {
  String prefix = jobConf.get(MRJobConfig.MROUTPUT_FILE_NAME_PREFIX);
  if (prefix == null) {
    prefix = "part-v" + 
        nonTaskNumberFormat.format(getContext().getTaskVertexIndex()) +  
        "-o" + nonTaskNumberFormat.format(getContext().getOutputIndex());
  }
  return prefix;
}
 
Example #13
Source File: YARNRunner.java    From tez with Apache License 2.0
private void setupMapReduceEnv(Configuration jobConf,
    Map<String, String> environment, boolean isMap) throws IOException {

  if (isMap) {
    warnForJavaLibPath(
        jobConf.get(MRJobConfig.MAP_JAVA_OPTS,""),
        "map",
        MRJobConfig.MAP_JAVA_OPTS,
        MRJobConfig.MAP_ENV);
    warnForJavaLibPath(
        jobConf.get(MRJobConfig.MAPRED_MAP_ADMIN_JAVA_OPTS,""),
        "map",
        MRJobConfig.MAPRED_MAP_ADMIN_JAVA_OPTS,
        MRJobConfig.MAPRED_ADMIN_USER_ENV);
  } else {
    warnForJavaLibPath(
        jobConf.get(MRJobConfig.REDUCE_JAVA_OPTS,""),
        "reduce",
        MRJobConfig.REDUCE_JAVA_OPTS,
        MRJobConfig.REDUCE_ENV);
    warnForJavaLibPath(
        jobConf.get(MRJobConfig.MAPRED_REDUCE_ADMIN_JAVA_OPTS,""),
        "reduce",
        MRJobConfig.MAPRED_REDUCE_ADMIN_JAVA_OPTS,
        MRJobConfig.MAPRED_ADMIN_USER_ENV);
  }

  MRHelpers.updateEnvBasedOnMRTaskEnv(jobConf, environment, isMap);
}
 
Example #14
Source File: SplitMetaInfoReaderTez.java    From tez with Apache License 2.0
/**
 * Get the split meta info for the task with a specific index. This method
 * avoids creating meta info objects for the splits that precede the requested index.
 *
 * @param conf job configuration.
 * @param fs FileSystem.
 * @param index the index of the task.
 * @return split meta info object of the task.
 * @throws IOException
 */
public static TaskSplitMetaInfo getSplitMetaInfo(Configuration conf,
    FileSystem fs, int index) throws IOException {
  FSDataInputStream in = null;
  try {
    in = getFSDataIS(conf, fs);
    final String jobSplitFile = MRJobConfig.JOB_SPLIT;
    final String basePath =
        conf.get(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR, ".");
    final int numSplits = WritableUtils.readVInt(in); // TODO: check for insane values
    if (numSplits <= index) {
      throw new IOException("Index is larger than the number of splits");
    }
    JobSplit.SplitMetaInfo splitMetaInfo = new JobSplit.SplitMetaInfo();
    int iter = 0;
    while (iter++ <= index) {
      splitMetaInfo.readFields(in);
    }
    JobSplit.TaskSplitIndex splitIndex = new JobSplit.TaskSplitIndex(
        new Path(basePath, jobSplitFile)
            .toUri().toString(), splitMetaInfo.getStartOffset());
    return new JobSplit.TaskSplitMetaInfo(splitIndex,
        splitMetaInfo.getLocations(), splitMetaInfo.getInputDataLength());
  } finally {
    if (in != null) {
      in.close();
    }
  }
}
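A hedged usage sketch, reading only the current task's split meta info instead of the whole file (the context call mirrors the other examples on this page; the local FileSystem choice is an assumption):

TaskSplitMetaInfo thisTask = SplitMetaInfoReaderTez.getSplitMetaInfo(
    jobConf, FileSystem.getLocal(jobConf), getContext().getTaskIndex());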
 
Example #15
Source File: MRInput.java    From tez with Apache License 2.0
MRInputConfigBuilder(Configuration conf, Class<?> inputFormatParam) {
  this.conf = conf;
  if (inputFormatParam != null) {
    inputFormatProvided = true;
    this.inputFormat = inputFormatParam;
    if (org.apache.hadoop.mapred.InputFormat.class.isAssignableFrom(inputFormatParam)) {
      useNewApi = false;
    } else if (org.apache.hadoop.mapreduce.InputFormat.class.isAssignableFrom(inputFormatParam)) {
      useNewApi = true;
    } else {
      throw new TezUncheckedException("inputFormat must be assignable from either " +
          "org.apache.hadoop.mapred.InputFormat or " +
          "org.apache.hadoop.mapreduce.InputFormat" +
          " Given: " + inputFormatParam.getName());
    }
  } else {
    inputFormatProvided = false;
    useNewApi = conf.getBoolean(MRJobConfig.NEW_API_MAPPER_CONFIG, true);
    try {
      if (useNewApi) {
        this.inputFormat = conf.getClassByName(conf.get(MRJobConfig.INPUT_FORMAT_CLASS_ATTR));
        Preconditions.checkState(org.apache.hadoop.mapreduce.InputFormat.class
            .isAssignableFrom(this.inputFormat));
      } else {
        this.inputFormat = conf.getClassByName(conf.get("mapred.input.format.class"));
        Preconditions.checkState(org.apache.hadoop.mapred.InputFormat.class
            .isAssignableFrom(this.inputFormat));
      }
    } catch (ClassNotFoundException e) {
      throw new TezUncheckedException(e);
    }
    initializeInputPath();
  }
}
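This constructor is normally reached through MRInput.createConfigBuilder. A hedged sketch of building a grouped-splits data source (assumes the Tez 0.5+ builder API; path and vertex wiring are illustrative):

DataSourceDescriptor source = MRInput.createConfigBuilder(new Configuration(),
    org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class, "/data/in") // path is illustrative
    .groupSplits(true) // group splits in the AM
    .build();
vertex.addDataSource("MRInput", source);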
 
Example #16
Source File: MRInput.java    From tez with Apache License 2.0
private void setupBasicConf(Configuration inputConf) {
  if (inputFormatProvided) {
    inputConf.setBoolean(MRJobConfig.NEW_API_MAPPER_CONFIG, useNewApi);
    if (useNewApi) {
      inputConf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, inputFormat.getName());
    } else {
      inputConf.set("mapred.input.format.class", inputFormat.getName());
    }
  }
}
 
Example #17
Source File: MRInput.java    From tez with Apache License 2.0
@Override
public List<Event> initialize() throws IOException {
  super.initialize();
  getContext().inputIsReady();
  this.splitInfoViaEvents = jobConf.getBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS,
      MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS_DEFAULT);
  LOG.info(getContext().getSourceVertexName() + " using new mapreduce API=" + useNewApi +
      ", split via event=" + splitInfoViaEvents + ", numPhysicalInputs=" +
      getNumPhysicalInputs());
  initializeInternal();
  return null;
}
 
Example #18
Source File: MRCombiner.java    From tez with Apache License 2.0
private void runNewCombiner(final TezRawKeyValueIterator rawIter, final Writer writer) throws InterruptedException, IOException {
  
  RecordWriter recordWriter = new RecordWriter() {

    @Override
    public void write(Object key, Object value) throws IOException,
        InterruptedException {
      writer.append(key, value);
      combineOutputRecordsCounter.increment(1);
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException,
        InterruptedException {
      // Will be closed by whoever invokes the combiner.
    }
  };
  
  Class<? extends org.apache.hadoop.mapreduce.Reducer> reducerClazz = (Class<? extends org.apache.hadoop.mapreduce.Reducer>) conf
      .getClass(MRJobConfig.COMBINE_CLASS_ATTR, null,
          org.apache.hadoop.mapreduce.Reducer.class);
  org.apache.hadoop.mapreduce.Reducer reducer = ReflectionUtils.newInstance(reducerClazz, conf);
  
  org.apache.hadoop.mapreduce.Reducer.Context reducerContext =
      createReduceContext(
          conf,
          mrTaskAttemptID,
          rawIter,
          new MRCounters.MRCounter(combineInputRecordsCounter),
          new MRCounters.MRCounter(combineOutputRecordsCounter),
          recordWriter,
          reporter,
          (RawComparator)comparator,
          keyClass,
          valClass);
  
  reducer.run(reducerContext);
  recordWriter.close(reducerContext);
}
 
Example #19
Source File: MRCombiner.java    From incubator-tez with Apache License 2.0
private void runNewCombiner(final TezRawKeyValueIterator rawIter, final Writer writer) throws InterruptedException, IOException {
  
  RecordWriter recordWriter = new RecordWriter() {

    @Override
    public void write(Object key, Object value) throws IOException,
        InterruptedException {
      writer.append(key, value);
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException,
        InterruptedException {
      // Will be closed by whoever invokes the combiner.
    }
  };
  
  Class<? extends org.apache.hadoop.mapreduce.Reducer> reducerClazz = (Class<? extends org.apache.hadoop.mapreduce.Reducer>) conf
      .getClass(MRJobConfig.COMBINE_CLASS_ATTR, null,
          org.apache.hadoop.mapreduce.Reducer.class);
  org.apache.hadoop.mapreduce.Reducer reducer = ReflectionUtils.newInstance(reducerClazz, conf);
  
  org.apache.hadoop.mapreduce.Reducer.Context reducerContext =
      createReduceContext(
          conf,
          mrTaskAttemptID,
          rawIter,
          new MRCounters.MRCounter(combineInputKeyCounter),
          new MRCounters.MRCounter(combineInputValueCounter),
          recordWriter,
          reporter,
          (RawComparator)comparator,
          keyClass,
          valClass);
  
  reducer.run(reducerContext);
  recordWriter.close(reducerContext);
}
 
Example #20
Source File: MRInput.java    From incubator-tez with Apache License 2.0
@Override
public List<Event> initialize() throws IOException {
  super.initialize();
  getContext().inputIsReady();
  this.splitInfoViaEvents = jobConf.getBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS,
      MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS_DEFAULT);
  LOG.info("Using New mapreduce API: " + useNewApi
      + ", split information via event: " + splitInfoViaEvents);
  initializeInternal();
  return null;
}
 
Example #21
Source File: MRInputBase.java    From incubator-tez with Apache License 2.0
public List<Event> initialize() throws IOException {
  getContext().requestInitialMemory(0l, null); // mandatory call
  MRRuntimeProtos.MRInputUserPayloadProto mrUserPayload =
      MRHelpers.parseMRInputPayload(getContext().getUserPayload());
  Preconditions.checkArgument(mrUserPayload.hasSplits() == false,
      "Split information not expected in " + this.getClass().getName());
  Configuration conf = MRHelpers.createConfFromByteString(mrUserPayload.getConfigurationBytes());

  this.jobConf = new JobConf(conf);
  // Add tokens to the jobConf - in case they are accessed within the RR / IF
  jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());

  TaskAttemptID taskAttemptId = new TaskAttemptID(
      new TaskID(
          Long.toString(getContext().getApplicationId().getClusterTimestamp()),
          getContext().getApplicationId().getId(), TaskType.MAP,
          getContext().getTaskIndex()),
      getContext().getTaskAttemptNumber());

  jobConf.set(MRJobConfig.TASK_ATTEMPT_ID,
      taskAttemptId.toString());
  jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID,
      getContext().getDAGAttemptNumber());

  this.inputRecordCounter = getContext().getCounters().findCounter(
      TaskCounter.INPUT_RECORDS_PROCESSED);

  useNewApi = this.jobConf.getUseNewMapper();
  return null;
}
 
Example #22
Source File: TestReduceProcessor.java    From tez with Apache License 2.0
public void setUpJobConf(JobConf job) {
  job.set(TezRuntimeFrameworkConfigs.LOCAL_DIRS, workDir.toString());
  job.set(MRConfig.LOCAL_DIR, workDir.toString());
  job.setClass(
      Constants.TEZ_RUNTIME_TASK_OUTPUT_MANAGER,
      TezTaskOutputFiles.class,
      TezTaskOutput.class);
  job.set(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS, MRPartitioner.class.getName());
  job.setNumReduceTasks(1);
  job.setInt(MRJobConfig.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
}
 
Example #23
Source File: YARNRunner.java    From incubator-tez with Apache License 2.0
private void setupMapReduceEnv(Configuration jobConf,
    Map<String, String> environment, boolean isMap) throws IOException {

  if (isMap) {
    warnForJavaLibPath(
        jobConf.get(MRJobConfig.MAP_JAVA_OPTS,""),
        "map",
        MRJobConfig.MAP_JAVA_OPTS,
        MRJobConfig.MAP_ENV);
    warnForJavaLibPath(
        jobConf.get(MRJobConfig.MAPRED_MAP_ADMIN_JAVA_OPTS,""),
        "map",
        MRJobConfig.MAPRED_MAP_ADMIN_JAVA_OPTS,
        MRJobConfig.MAPRED_ADMIN_USER_ENV);
  } else {
    warnForJavaLibPath(
        jobConf.get(MRJobConfig.REDUCE_JAVA_OPTS,""),
        "reduce",
        MRJobConfig.REDUCE_JAVA_OPTS,
        MRJobConfig.REDUCE_ENV);
    warnForJavaLibPath(
        jobConf.get(MRJobConfig.MAPRED_REDUCE_ADMIN_JAVA_OPTS,""),
        "reduce",
        MRJobConfig.MAPRED_REDUCE_ADMIN_JAVA_OPTS,
        MRJobConfig.MAPRED_ADMIN_USER_ENV);
  }

  MRHelpers.updateEnvironmentForMRTasks(jobConf, environment, isMap);
}
 
Example #24
Source File: MROutput.java    From incubator-tez with Apache License 2.0
/**
 * Creates the user payload to be set on the OutputDescriptor for MROutput
 * @param conf Configuration for the OutputFormat
 * @param outputFormatName Name of the class of the OutputFormat
 * @param useNewApi Use new mapreduce API or old mapred API
 * @return the user payload to be set on the OutputDescriptor for MROutput
 * @throws IOException
 */
public static byte[] createUserPayload(Configuration conf, 
    String outputFormatName, boolean useNewApi) throws IOException {
  Configuration outputConf = new JobConf(conf);
  outputConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, outputFormatName);
  outputConf.setBoolean("mapred.mapper.new-api", useNewApi);
  MRHelpers.translateVertexConfToTez(outputConf);
  MRHelpers.doJobClientMagic(outputConf);
  return TezUtils.createUserPayloadFromConf(outputConf);
}
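A hedged sketch of calling this helper (assumes the incubator-tez 0.4-era byte[]-payload API; the output path and format are illustrative):

Configuration conf = new Configuration();
conf.set(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.OUTDIR, "/data/out"); // illustrative
byte[] payload = MROutput.createUserPayload(conf,
    org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class.getName(), true);
// Assumed 0.4-era wiring:
// new OutputDescriptor(MROutput.class.getName()).setUserPayload(payload)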
 
Example #25
Source File: MROutput.java    From incubator-tez with Apache License 2.0
private String getOutputFileNamePrefix() {
  String prefix = jobConf.get(MRJobConfig.MROUTPUT_FILE_NAME_PREFIX);
  if (prefix == null) {
    prefix = "part-v" + 
        nonTaskNumberFormat.format(getContext().getTaskVertexIndex()) +  
        "-o" + nonTaskNumberFormat.format(getContext().getOutputIndex());
  }
  return prefix;
}
 
Example #26
Source File: MROutput.java    From tez with Apache License 2.0
protected List<Event> initializeBase() throws IOException, InterruptedException {
  getContext().requestInitialMemory(0l, null); //mandatory call
  taskNumberFormat.setMinimumIntegerDigits(5);
  taskNumberFormat.setGroupingUsed(false);
  nonTaskNumberFormat.setMinimumIntegerDigits(3);
  nonTaskNumberFormat.setGroupingUsed(false);
  UserPayload userPayload = getContext().getUserPayload();
  this.jobConf = new JobConf(getContext().getContainerConfiguration());
  TezUtils.addToConfFromByteString(this.jobConf, ByteString.copyFrom(userPayload.getPayload()));
  // Add tokens to the jobConf - in case they are accessed within the RW / OF
  jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());
  this.isMapperOutput = jobConf.getBoolean(MRConfig.IS_MAP_PROCESSOR,
      false);
  if (this.isMapperOutput) {
    this.useNewApi = this.jobConf.getUseNewMapper();
  } else {
    this.useNewApi = this.jobConf.getUseNewReducer();
  }
  jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID,
      getContext().getDAGAttemptNumber());
  TaskAttemptID taskAttemptId = org.apache.tez.mapreduce.hadoop.mapreduce.TaskAttemptContextImpl
      .createMockTaskAttemptID(getContext().getApplicationId().getClusterTimestamp(),
          getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
          getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), isMapperOutput);
  jobConf.set(JobContext.TASK_ATTEMPT_ID, taskAttemptId.toString());
  jobConf.set(JobContext.TASK_ID, taskAttemptId.getTaskID().toString());
  jobConf.setBoolean(JobContext.TASK_ISMAP, isMapperOutput);
  jobConf.setInt(JobContext.TASK_PARTITION,
    taskAttemptId.getTaskID().getId());
  jobConf.set(JobContext.ID, taskAttemptId.getJobID().toString());
  
  String outputFormatClassName;

  outputRecordCounter = getContext().getCounters().findCounter(
      TaskCounter.OUTPUT_RECORDS);

  if (useNewApi) {
    // set the output part name to have a unique prefix
    if (jobConf.get(MRJobConfig.FILEOUTPUTFORMAT_BASE_OUTPUT_NAME) == null) {
      jobConf.set(MRJobConfig.FILEOUTPUTFORMAT_BASE_OUTPUT_NAME,
          getOutputFileNamePrefix());
    }

    newApiTaskAttemptContext = createTaskAttemptContext(taskAttemptId);
    try {
      newOutputFormat =
          org.apache.hadoop.util.ReflectionUtils.newInstance(
              newApiTaskAttemptContext.getOutputFormatClass(), jobConf);
      outputFormatClassName = newOutputFormat.getClass().getName();
    } catch (ClassNotFoundException cnfe) {
      throw new IOException(cnfe);
    }

    initCommitter(jobConf, useNewApi);
  } else {
    oldApiTaskAttemptContext =
        new org.apache.tez.mapreduce.hadoop.mapred.TaskAttemptContextImpl(
            jobConf, taskAttemptId,
            new MRTaskReporter(getContext()));
    oldOutputFormat = jobConf.getOutputFormat();
    outputFormatClassName = oldOutputFormat.getClass().getName();

    initCommitter(jobConf, useNewApi);
  }

  LOG.info(getContext().getDestinationVertexName() + ": "
      + "outputFormat=" + outputFormatClassName
      + ", using new mapreduce API=" + useNewApi);
  return null;
}
 
Example #27
Source File: TestMapProcessor.java    From tez with Apache License 2.0
@Test(timeout = 30000)
public void testMapProcessorProgress() throws Exception {
  String dagName = "mrdag0";
  String vertexName = MultiStageMRConfigUtil.getInitialMapVertexName();
  JobConf jobConf = new JobConf(defaultConf);
  setUpJobConf(jobConf);

  MRHelpers.translateMRConfToTez(jobConf);
  jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);

  jobConf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);

  jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR, new Path(workDir,
      "localized-resources").toUri().toString());

  Path mapInput = new Path(workDir, "map0");


  MapUtils.generateInputSplit(localFs, workDir, jobConf, mapInput, 100000);

  InputSpec mapInputSpec = new InputSpec("NullSrcVertex",
      InputDescriptor.create(MRInputLegacy.class.getName())
          .setUserPayload(UserPayload.create(ByteBuffer.wrap(
              MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
                  .setConfigurationBytes(TezUtils.createByteStringFromConf
                      (jobConf)).build()
                  .toByteArray()))),
      1);
  OutputSpec mapOutputSpec = new OutputSpec("NullDestVertex",
      OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName())
          .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)), 1);

  TezSharedExecutor sharedExecutor = new TezSharedExecutor(jobConf);
  final LogicalIOProcessorRuntimeTask task = MapUtils.createLogicalTask
      (localFs, workDir, jobConf, 0,
          new Path(workDir, "map0"), new TestUmbilical(), dagName, vertexName,
          Collections.singletonList(mapInputSpec),
          Collections.singletonList(mapOutputSpec), sharedExecutor);

  ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
  Thread monitorProgress = new Thread(new Runnable() {
    @Override
    public void run() {
      float prog = task.getProgress();
      if(prog > 0.0f && prog < 1.0f)
        progressUpdate = prog;
    }
  });

  task.initialize();
  scheduler.scheduleAtFixedRate(monitorProgress, 0, 1,
      TimeUnit.MILLISECONDS);
  task.run();
  Assert.assertTrue("Progress Updates should be captured!",
      progressUpdate > 0.0f && progressUpdate < 1.0f);
  task.close();
  sharedExecutor.shutdownNow();
}
 
Example #28
Source File: TezDagBuilder.java    From spork with Apache License 2.0
/**
 * Return EdgeProperty that connects two vertices.
 *
 * @param from
 * @param to
 * @return EdgeProperty
 * @throws IOException
 */
private EdgeProperty newEdge(TezOperator from, TezOperator to)
        throws IOException {
    TezEdgeDescriptor edge = to.inEdges.get(from.getOperatorKey());
    PhysicalPlan combinePlan = edge.combinePlan;

    InputDescriptor in = InputDescriptor.create(edge.inputClassName);
    OutputDescriptor out = OutputDescriptor.create(edge.outputClassName);

    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties(), false);
    if (!combinePlan.isEmpty()) {
        addCombiner(combinePlan, to, conf);
    }

    List<POLocalRearrangeTez> lrs = PlanHelper.getPhysicalOperators(from.plan,
            POLocalRearrangeTez.class);

    for (POLocalRearrangeTez lr : lrs) {
        if (lr.getOutputKey().equals(to.getOperatorKey().toString())) {
            byte keyType = lr.getKeyType();
            setIntermediateOutputKeyValue(keyType, conf, to, lr.isConnectedToPackage());
            // In case of secondary key sort, main key type is the actual key type
            conf.set("pig.reduce.key.type", Byte.toString(lr.getMainKeyType()));
            break;
        }
    }

    conf.setIfUnset(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS,
            MRPartitioner.class.getName());

    if (edge.getIntermediateOutputKeyClass() != null) {
        conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS,
                edge.getIntermediateOutputKeyClass());
    }

    if (edge.getIntermediateOutputValueClass() != null) {
        conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS,
                edge.getIntermediateOutputValueClass());
    }

    if (edge.getIntermediateOutputKeyComparatorClass() != null) {
        conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_COMPARATOR_CLASS,
                edge.getIntermediateOutputKeyComparatorClass());
    }

    conf.setBoolean(MRConfiguration.MAPPER_NEW_API, true);
    conf.set("pig.pigContext", ObjectSerializer.serialize(pc));
    conf.set("udf.import.list",
            ObjectSerializer.serialize(PigContext.getPackageImportList()));

    if(to.isGlobalSort() || to.isLimitAfterSort()){
        conf.set("pig.sortOrder",
                ObjectSerializer.serialize(to.getSortOrder()));
    }

    if (edge.isUseSecondaryKey()) {
        conf.set("pig.secondarySortOrder",
                ObjectSerializer.serialize(edge.getSecondarySortOrder()));
        conf.set(org.apache.hadoop.mapreduce.MRJobConfig.PARTITIONER_CLASS_ATTR,
                SecondaryKeyPartitioner.class.getName());
        // These needs to be on the vertex as well for POShuffleTezLoad to pick it up.
        // Tez framework also expects this to be per vertex and not edge. IFile.java picks
        // up keyClass and valueClass from vertex config. TODO - check with Tez folks
        // In MR - job.setSortComparatorClass() or MRJobConfig.KEY_COMPARATOR
        conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_COMPARATOR_CLASS,
                PigSecondaryKeyComparator.class.getName());
        // In MR - job.setOutputKeyClass() or MRJobConfig.OUTPUT_KEY_CLASS
        conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, NullableTuple.class.getName());
        setGroupingComparator(conf, PigSecondaryKeyGroupComparator.class.getName());
    }

    if (edge.partitionerClass != null) {
        conf.set(org.apache.hadoop.mapreduce.MRJobConfig.PARTITIONER_CLASS_ATTR,
                edge.partitionerClass.getName());
    }

    conf.set("udf.import.list",
            ObjectSerializer.serialize(PigContext.getPackageImportList()));

    MRToTezHelper.processMRSettings(conf, globalConf);

    String historyString = convertToHistoryText("", conf);
    in.setUserPayload(TezUtils.createUserPayloadFromConf(conf)).setHistoryText(historyString);
    out.setUserPayload(TezUtils.createUserPayloadFromConf(conf)).setHistoryText(historyString);

    if (edge.dataMovementType != DataMovementType.BROADCAST && to.getEstimatedParallelism() != -1
            && (to.isGlobalSort() || to.isSkewedJoin())) {
        // Use custom edge
        return EdgeProperty.create((EdgeManagerPluginDescriptor) null,
                edge.dataSourceType, edge.schedulingType, out, in);
    }

    return EdgeProperty.create(edge.dataMovementType, edge.dataSourceType,
            edge.schedulingType, out, in);
}
 
Example #29
Source File: ResourceMgrDelegate.java    From tez with Apache License 2.0
public String getSystemDir() throws IOException, InterruptedException {
  Path sysDir = new Path(MRJobConfig.JOB_SUBMIT_DIR);
  //FileContext.getFileContext(conf).delete(sysDir, true);
  return sysDir.toString();
}
 
Example #30
Source File: YARNRunner.java    From tez with Apache License 2.0
private Map<String, LocalResource> createJobLocalResources(
    Configuration jobConf, String jobSubmitDir)
    throws IOException {

  // Setup LocalResources
  Map<String, LocalResource> localResources =
      new HashMap<String, LocalResource>();

  Path jobConfPath = new Path(jobSubmitDir, MRJobConfig.JOB_CONF_FILE);

  URL yarnUrlForJobSubmitDir = ConverterUtils
      .getYarnUrlFromPath(defaultFileContext.getDefaultFileSystem()
          .resolvePath(
              defaultFileContext.makeQualified(new Path(jobSubmitDir))));
  LOG.debug("Creating setup context, jobSubmitDir url is "
      + yarnUrlForJobSubmitDir);

  localResources.put(MRJobConfig.JOB_CONF_FILE,
      createApplicationResource(defaultFileContext,
          jobConfPath, LocalResourceType.FILE));
  if (jobConf.get(MRJobConfig.JAR) != null) {
    Path jobJarPath = new Path(jobConf.get(MRJobConfig.JAR));
    LocalResource rc = createApplicationResource(defaultFileContext,
        jobJarPath,
        LocalResourceType.FILE);
    // FIXME fix pattern support
    // String pattern = conf.getPattern(JobContext.JAR_UNPACK_PATTERN,
    // JobConf.UNPACK_JAR_PATTERN_DEFAULT).pattern();
    // rc.setPattern(pattern);
    localResources.put(MRJobConfig.JOB_JAR, rc);
  } else {
    // Job jar may be null. For e.g, for pipes, the job jar is the hadoop
    // mapreduce jar itself which is already on the classpath.
    LOG.info("Job jar is not present. "
        + "Not adding any jar to the list of resources.");
  }

  // TODO gross hack
  for (String s : new String[] {
      MRJobConfig.JOB_SPLIT,
      MRJobConfig.JOB_SPLIT_METAINFO}) {
    localResources.put(s,
        createApplicationResource(defaultFileContext,
            new Path(jobSubmitDir, s), LocalResourceType.FILE));
  }

  MRApps.setupDistributedCache(jobConf, localResources);

  return localResources;
}