Java Code Examples for org.apache.hadoop.mapred.JobConf#getInt()

The following examples show how to use org.apache.hadoop.mapred.JobConf#getInt(). Each example is taken from an open source project; the source file, project, and license are noted above it.
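Before the examples, here is a minimal, self-contained sketch of the call itself. JobConf#getInt(name, defaultValue) looks up a configuration property and parses it as an int, returning the supplied default when the property is not set. The property names below are hypothetical and used only for illustration:

import org.apache.hadoop.mapred.JobConf;

public class GetIntSketch {
  public static void main(String[] args) {
    JobConf conf = new JobConf();

    // "my.example.threads" is a hypothetical property name.
    conf.setInt("my.example.threads", 4);

    // Returns 4: the property was set above.
    int threads = conf.getInt("my.example.threads", 10);

    // Returns the default value 10: this property was never set.
    int missing = conf.getInt("my.example.missing", 10);

    System.out.println(threads + " " + missing);
  }
}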
Example 1
Source File: LinkRank.java    From anthelion with Apache License 2.0
/**
 * Configures the job, sets the damping factor, rank one score, and other
 * needed values for analysis.
 */
public void configure(JobConf conf) {

  try {
    this.conf = conf;
    this.dampingFactor = conf.getFloat("link.analyze.damping.factor", 0.85f);
    this.rankOne = conf.getFloat("link.analyze.rank.one", 0.0f);
    this.itNum = conf.getInt("link.analyze.iteration", 0);
    limitPages = conf.getBoolean("link.ignore.limit.page", true);
    limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
  }
  catch (Exception e) {
    LOG.error(StringUtils.stringifyException(e));
    throw new IllegalArgumentException(e);
  }
}
 
Example 2
Source File: AvroAsJsonOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
static <K> void configureDataFileWriter(DataFileWriter<K> writer,
    JobConf job) throws UnsupportedEncodingException {

    if (FileOutputFormat.getCompressOutput(job)) {
        int level = job.getInt(org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY,
                org.apache.avro.mapred.AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
        String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC) ?
            CodecFactory.deflateCodec(level) : CodecFactory.fromString(codecName);
        writer.setCodec(factory);
    }

    writer.setSyncInterval(job.getInt(org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY,
            DEFAULT_SYNC_INTERVAL));

    // copy metadata from job
    for (Map.Entry<String,String> e : job) {
        if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()),e.getValue());
        if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
                   URLDecoder.decode(e.getValue(), "ISO-8859-1")
                   .getBytes("ISO-8859-1"));
    }
}
 
Example 3
Source File: ImportRecordReaderFactory.java    From emr-dynamodb-connector with Apache License 2.0
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants
          .EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
        + " " + inputSplit.getClass());
  }
}
 
Example 4
Source File: DistCp.java    From hadoop-gpu with Apache License 2.0
/** Mapper configuration.
 * Extracts source and destination file system, as well as
 * top-level paths on source and destination directories.
 * Gets the named file systems, to be used later in map.
 */
public void configure(JobConf job)
{
  destPath = new Path(job.get(DST_DIR_LABEL, "/"));
  try {
    destFileSys = destPath.getFileSystem(job);
  } catch (IOException ex) {
    throw new RuntimeException("Unable to get the named file system.", ex);
  }
  sizeBuf = job.getInt("copy.buf.size", 128 * 1024);
  buffer = new byte[sizeBuf];
  ignoreReadFailures = job.getBoolean(Options.IGNORE_READ_FAILURES.propertyname, false);
  preserve_status = job.getBoolean(Options.PRESERVE_STATUS.propertyname, false);
  if (preserve_status) {
    preseved = FileAttribute.parse(job.get(PRESERVE_STATUS_LABEL));
  }
  update = job.getBoolean(Options.UPDATE.propertyname, false);
  overwrite = !update && job.getBoolean(Options.OVERWRITE.propertyname, false);
  this.job = job;
}
 
Example 5
Source File: MultithreadedMapRunner.java    From hadoop-gpu with Apache License 2.0
@SuppressWarnings("unchecked")
public void configure(JobConf jobConf) {
  int numberOfThreads =
    jobConf.getInt("mapred.map.multithreadedrunner.threads", 10);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Configuring jobConf " + jobConf.getJobName() +
              " to use " + numberOfThreads + " threads");
  }

  this.job = jobConf;
  //increment processed counter only if skipping feature is enabled
  this.incrProcCount = SkipBadRecords.getMapperMaxSkipRecords(job)>0 && 
    SkipBadRecords.getAutoIncrMapperProcCount(job);
  this.mapper = ReflectionUtils.newInstance(jobConf.getMapperClass(),
      jobConf);

  // Creating a threadpool of the configured size to execute the Mapper
  // map method in parallel.
  executorService = new ThreadPoolExecutor(numberOfThreads, numberOfThreads, 
                                           0L, TimeUnit.MILLISECONDS,
                                           new BlockingArrayQueue
                                             (numberOfThreads));
}
 
Example 6
Source File: MultithreadedMapRunner.java    From hadoop with Apache License 2.0
@SuppressWarnings("unchecked")
public void configure(JobConf jobConf) {
  int numberOfThreads =
    jobConf.getInt(MultithreadedMapper.NUM_THREADS, 10);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Configuring jobConf " + jobConf.getJobName() +
              " to use " + numberOfThreads + " threads");
  }

  this.job = jobConf;
  //increment processed counter only if skipping feature is enabled
  this.incrProcCount = SkipBadRecords.getMapperMaxSkipRecords(job)>0 && 
    SkipBadRecords.getAutoIncrMapperProcCount(job);
  this.mapper = ReflectionUtils.newInstance(jobConf.getMapperClass(),
      jobConf);

  // Creating a threadpool of the configured size to execute the Mapper
  // map method in parallel.
  executorService = new ThreadPoolExecutor(numberOfThreads, numberOfThreads, 
                                           0L, TimeUnit.MILLISECONDS,
                                           new BlockingArrayQueue
                                             (numberOfThreads));
}
 
Example 7
Source File: ValueAggregatorJobBase.java    From big-c with Apache License 2.0
private static ArrayList<ValueAggregatorDescriptor> getAggregatorDescriptors(JobConf job) {
  String advn = "aggregator.descriptor";
  int num = job.getInt(advn + ".num", 0);
  ArrayList<ValueAggregatorDescriptor> retv = new ArrayList<ValueAggregatorDescriptor>(num);
  for (int i = 0; i < num; i++) {
    String spec = job.get(advn + "." + i);
    ValueAggregatorDescriptor ad = getValueAggregatorDescriptor(spec, job);
    if (ad != null) {
      retv.add(ad);
    }
  }
  return retv;
}
 
Example 8
Source File: RandomWriter.java    From hadoop-gpu with Apache License 2.0
/**
 * Save the values out of the configuration that we need to write
 * the data.
 */
@Override
public void configure(JobConf job) {
  numBytesToWrite = job.getLong("test.randomwrite.bytes_per_map",
                                1*1024*1024*1024);
  minKeySize = job.getInt("test.randomwrite.min_key", 10);
  keySizeRange = 
    job.getInt("test.randomwrite.max_key", 1000) - minKeySize;
  minValueSize = job.getInt("test.randomwrite.min_value", 0);
  valueSizeRange = 
    job.getInt("test.randomwrite.max_value", 20000) - minValueSize;
}
 
Example 9
Source File: FreeGenerator.java    From nutch-htmlunit with Apache License 2.0
@Override
public void configure(JobConf job) {
  super.configure(job);
  defaultInterval = job.getInt("db.fetch.interval.default", 0);
  scfilters = new ScoringFilters(job);
  if (job.getBoolean(FILTER_KEY, false)) {
    filters = new URLFilters(job);
  }
  if (job.getBoolean(NORMALIZE_KEY, false)) {
    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
  }
}
 
Example 10
Source File: DistCp.java    From RDFS with Apache License 2.0
/**
 * Produce splits such that each is no greater than the quotient of the
 * total size and the number of splits requested.
 * @param job The handle to the JobConf object
 * @param numSplits Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException {
  int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
  long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
  String srcFileList = job.get(SRC_LIST_LABEL, "");
  Path srcFileListPath = new Path(srcFileList);
  if (cnfiles < 0 || cbsize < 0 || "".equals(srcFileList)) {
    throw new RuntimeException("Invalid metadata: #files(" + cnfiles +
        ") total_size(" + cbsize + ") src_chunk_file_list_uri(" +
        srcFileList + ")");
  }
  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
  SequenceFile.Reader sl = null;
  String splitList = job.get(SPLIT_LIST_LABEL, "");
  if("".equals(splitList)) {
    throw new RuntimeException("Invalid metadata: split_list_uri(" +
        srcFileList + ")");
  }
  //split file list which contains start pos and split length pairs
  //they are used to split srcChunkFileList
  Path splitListPath = new Path(splitList);        
  FileSystem splitListFs = splitListPath.getFileSystem(job);
  try{
    sl = new SequenceFile.Reader(splitListFs, splitListPath, job);
    LongWritable startpos = new LongWritable();
    LongWritable length = new LongWritable();
    while (sl.next(startpos, length)) {
      splits.add(new FileSplit(srcFileListPath, startpos.get(), 
          length.get(), (String[])null));
    }
  }
  finally{
    checkAndClose(sl);
  }
  return splits.toArray(new FileSplit[splits.size()]);
}
 
Example 11
Source File: RandomWriter.java    From hadoop-book with Apache License 2.0
/**
 * Save the values out of the configuration that we need to write the
 * data.
 */
@Override
public void configure(JobConf job) {
    numBytesToWrite = job.getLong("test.randomwrite.bytes_per_map",
            1 * 1024 * 1024 * 1024);
    minKeySize = job.getInt("test.randomwrite.min_key", 10);
    keySizeRange =
            job.getInt("test.randomwrite.max_key", 1000) - minKeySize;
    minValueSize = job.getInt("test.randomwrite.min_value", 0);
    valueSizeRange =
            job.getInt("test.randomwrite.max_value", 20000) - minValueSize;
}
 
Example 12
Source File: AvroRecordWriter.java    From spork with Apache License 2.0
static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer,
    JobConf job) throws UnsupportedEncodingException {
  if (FileOutputFormat.getCompressOutput(job)) {
    int level = job.getInt(DEFLATE_LEVEL_KEY,
        DEFAULT_DEFLATE_LEVEL);
    String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
    CodecFactory factory = codecName.equals(DEFLATE_CODEC)
      ? CodecFactory.deflateCodec(level)
      : CodecFactory.fromString(codecName);
    writer.setCodec(factory);
  }

  // Do max as core-default.xml has io.file.buffer.size as 4K
  writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY, Math.max(
          job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL)));

  // copy metadata from job
  for (Map.Entry<String,String> e : job) {
    if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()),
                     e.getValue());
    if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
                     URLDecoder.decode(e.getValue(), "ISO-8859-1")
                     .getBytes("ISO-8859-1"));
  }
}
 
Example 13
Source File: RandomWriter.java    From RDFS with Apache License 2.0
/**
 * Save the values out of the configuration that we need to write
 * the data.
 */
@Override
public void configure(JobConf job) {
  numBytesToWrite = job.getLong("test.randomwrite.bytes_per_map",
                                1*1024*1024*1024);
  minKeySize = job.getInt("test.randomwrite.min_key", 10);
  keySizeRange = 
    job.getInt("test.randomwrite.max_key", 1000) - minKeySize;
  minValueSize = job.getInt("test.randomwrite.min_value", 0);
  valueSizeRange = 
    job.getInt("test.randomwrite.max_value", 20000) - minValueSize;
}
 
Example 14
Source File: NLineInputFormat.java    From big-c with Apache License 2.0
public void configure(JobConf conf) {
  N = conf.getInt("mapreduce.input.lineinputformat.linespermap", 1);
}
 
Example 15
Source File: AbstractMROldApiSaveTest.java    From elasticsearch-hadoop with Apache License 2.0
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    return super.getSplits(job, job.getInt("actual.splits", 3));
}
 
Example 16
Source File: GenerateDistCacheData.java    From big-c with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext jobCtxt) throws IOException {
  final JobConf jobConf = new JobConf(jobCtxt.getConfiguration());
  final JobClient client = new JobClient(jobConf);
  ClusterStatus stat = client.getClusterStatus(true);
  int numTrackers = stat.getTaskTrackers();
  final int fileCount = jobConf.getInt(GRIDMIX_DISTCACHE_FILE_COUNT, -1);

  // Total size of distributed cache files to be generated
  final long totalSize = jobConf.getLong(GRIDMIX_DISTCACHE_BYTE_COUNT, -1);
  // Get the path of the special file
  String distCacheFileList = jobConf.get(GRIDMIX_DISTCACHE_FILE_LIST);
  if (fileCount < 0 || totalSize < 0 || distCacheFileList == null) {
    throw new RuntimeException("Invalid metadata: #files (" + fileCount
        + "), total_size (" + totalSize + "), filelisturi ("
        + distCacheFileList + ")");
  }

  Path sequenceFile = new Path(distCacheFileList);
  FileSystem fs = sequenceFile.getFileSystem(jobConf);
  FileStatus srcst = fs.getFileStatus(sequenceFile);
  // Consider the number of TTs * mapSlotsPerTracker as number of mappers.
  int numMapSlotsPerTracker = jobConf.getInt(TTConfig.TT_MAP_SLOTS, 2);
  int numSplits = numTrackers * numMapSlotsPerTracker;

  List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
  LongWritable key = new LongWritable();
  BytesWritable value = new BytesWritable();

  // Average size of data to be generated by each map task
  final long targetSize = Math.max(totalSize / numSplits,
                            DistributedCacheEmulator.AVG_BYTES_PER_MAP);
  long splitStartPosition = 0L;
  long splitEndPosition = 0L;
  long acc = 0L;
  long bytesRemaining = srcst.getLen();
  SequenceFile.Reader reader = null;
  try {
    reader = new SequenceFile.Reader(fs, sequenceFile, jobConf);
    while (reader.next(key, value)) {

      // If adding this file would put this split past the target size,
      // cut the last split and put this file in the next split.
      if (acc + key.get() > targetSize && acc != 0) {
        long splitSize = splitEndPosition - splitStartPosition;
        splits.add(new FileSplit(
            sequenceFile, splitStartPosition, splitSize, (String[])null));
        bytesRemaining -= splitSize;
        splitStartPosition = splitEndPosition;
        acc = 0L;
      }
      acc += key.get();
      splitEndPosition = reader.getPosition();
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }
  if (bytesRemaining != 0) {
    splits.add(new FileSplit(
        sequenceFile, splitStartPosition, bytesRemaining, (String[])null));
  }

  return splits;
}
 
Example 17
Source File: DistRaid.java    From RDFS with Apache License 2.0
/**
 * Produce splits such that each is no greater than the quotient of the
 * total size and the number of splits requested.
 * 
 * @param job
 *          The handle to the JobConf object
 * @param numSplits
 *          Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException {
  final int srcCount = job.getInt(OP_COUNT_LABEL, -1);
  final int targetcount = srcCount / numSplits;
  String srclist = job.get(OP_LIST_LABEL, "");
  if (srcCount < 0 || "".equals(srclist)) {
    throw new RuntimeException("Invalid metadata: #files(" + srcCount
        + ") listuri(" + srclist + ")");
  }
  Path srcs = new Path(srclist);
  FileSystem fs = srcs.getFileSystem(job);

  List<FileSplit> splits = new ArrayList<FileSplit>(numSplits);

  Text key = new Text();
  PolicyInfo value = new PolicyInfo();
  SequenceFile.Reader in = null;
  long prev = 0L;
  int count = 0; // count src
  try {
    for (in = new SequenceFile.Reader(fs, srcs, job); in.next(key, value);) {
      long curr = in.getPosition();
      long delta = curr - prev;
      if (++count > targetcount) {
        count = 0;
        splits.add(new FileSplit(srcs, prev, delta, (String[]) null));
        prev = curr;
      }
    }
  } finally {
    in.close();
  }
  long remaining = fs.getFileStatus(srcs).getLen() - prev;
  if (remaining != 0) {
    splits.add(new FileSplit(srcs, prev, remaining, (String[]) null));
  }
  LOG.info("jobname= " + jobName + " numSplits=" + numSplits + 
           ", splits.size()=" + splits.size());
  return splits.toArray(new FileSplit[splits.size()]);
}
 
Example 18
Source File: RegexMapper.java    From big-c with Apache License 2.0
public void configure(JobConf job) {
  pattern = Pattern.compile(job.get(org.apache.hadoop.mapreduce.lib.map.
              RegexMapper.PATTERN));
  group = job.getInt(org.apache.hadoop.mapreduce.lib.map.
            RegexMapper.GROUP, 0);
}
 
Example 19
Source File: DistCp.java    From RDFS with Apache License 2.0
/**
 * Produce splits such that each is no greater than the quotient of the
 * total size and the number of splits requested.
 * @param job The handle to the JobConf object
 * @param numSplits Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException {
  int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
  long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
  long blocks = job.getLong(TOTAL_BLOCKS_LABEL, -1);
  String srcfilelist = job.get(SRC_LIST_LABEL, "");
  if (cnfiles < 0 || cbsize < 0 || blocks < 0 || "".equals(srcfilelist)) {
    throw new RuntimeException("Invalid metadata: #files(" + cnfiles +
                               ") total_size(" + cbsize + ") listuri(" +
                               srcfilelist + ")");
  }
  Path src = new Path(srcfilelist);
  FileSystem fs = src.getFileSystem(job);
  FileStatus srcst = fs.getFileStatus(src);

  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
  LongWritable key = new LongWritable();
  FilePairComparable value = new FilePairComparable();
  final long targetsize = getTargetSize(job, numSplits);
  long pos = 0L;
  long last = 0L;
  long acc = 0L;
  long cbrem = srcst.getLen();
  SequenceFile.Reader sl = null;
  try {
    sl = new SequenceFile.Reader(fs, src, job);
    for (; sl.next(key, value); last = sl.getPosition()) {
      // if adding this split would put this split past the target size,
      // cut the last split and put this next file in the next split.
      long increment = getIncrement(key, value);
      if (acc + increment > targetsize && acc != 0) {
        long splitsize = last - pos;
        splits.add(new FileSplit(src, pos, splitsize, (String[])null));
        cbrem -= splitsize;
        pos = last;
        acc = 0L;
      }
      acc += increment;
    }
  }
  finally {
    checkAndClose(sl);
  }
  if (cbrem != 0) {
    splits.add(new FileSplit(src, pos, cbrem, (String[])null));
  }

  return splits.toArray(new FileSplit[splits.size()]);
}
 
Example 20
Source File: TaskCalculator.java    From emr-dynamodb-connector with Apache License 2.0
public int getMaxMapTasks() throws IOException {
  JobConf conf = (JobConf) jobClient.getConf();

  // Total number of nodes in the cluster
  int nodes = jobClient.getClusterStatus().getTaskTrackers();
  log.info("Cluster has " + nodes + " active nodes.");
  if (nodes == 0) {
    log.warn("Cluster doesn't have any nodes");
    return 0;
  }

  // Memory per slot
  int slotMemory = conf.getInt("yarn.scheduler.minimum-allocation-mb", 1024); // Default value
  // from yarn-default.xml

  // Number of slots in a core node
  int nodeMemory = nodeCapacityProvider.getCoreNodeMemoryMB();
  int nodeSlots = nodeMemory / slotMemory;

  // Number of slots for a mapper
  int mapMemory = conf.getInt(MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB);
  int mapSlots = (int) Math.ceil((double) mapMemory / slotMemory);

  // Number of slots for an application master
  int amMemory = conf.getInt(MRJobConfig.MR_AM_VMEM_MB, MRJobConfig.DEFAULT_MR_AM_VMEM_MB);
  int appMasterSlots = (int) Math.ceil((double) amMemory / slotMemory);

  // Number of slots for a reducer
  int reduceMemory = conf.getInt(MRJobConfig.REDUCE_MEMORY_MB, MRJobConfig
      .DEFAULT_REDUCE_MEMORY_MB);
  int reduceSlots = (int) Math.ceil((double) reduceMemory / slotMemory);

  // Number of reducers
  int reducers = conf.getNumReduceTasks();

  // Calculate the number of mappers
  int mappers = yarnContainerAllocator.getMaxMappers(nodes, reducers, nodeSlots,
      appMasterSlots, mapSlots, reduceSlots);

  log.info("Slot size: " + slotMemory + "MB.");
  log.info("Node manager can allocate " + nodeMemory + "MB (" + nodeSlots + " slots) for "
      + "containers on each node.");
  log.info("Each mapper needs: " + mapMemory + "MB. (" + mapSlots + " slots)");
  log.info("Each reducer needs: " + reduceMemory + "MB. (" + reduceSlots + " slots)");
  log.info("MapReduce Application Manager needs: " + amMemory + " MB. (" + appMasterSlots + " "
      + "slots)");
  log.info("Number of reducers: " + reducers);
  log.info("Max number of cluster map tasks: " + mappers);

  if (mappers < 1) {
    log.warn("The calculated max number of concurrent map tasks is less than 1. Use 1 instead.");
    mappers = 1;
  }

  return mappers;
}