Java Code Examples for org.apache.hadoop.mapred.JobClient#getClusterStatus()

The following examples show how to use org.apache.hadoop.mapred.JobClient#getClusterStatus(). Each example lists its original project and source file above the code.
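
Before the project examples, here is a minimal, self-contained sketch of the call itself. It is illustrative only: the class name ClusterStatusDemo is invented here, and the JobConf is assumed to pick up the cluster's *-site.xml settings from the classpath. The boolean overload requests per-tracker detail:

import java.io.IOException;

import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class ClusterStatusDemo {
  public static void main(String[] args) throws IOException {
    JobConf conf = new JobConf();            // reads cluster settings from the classpath
    JobClient client = new JobClient(conf);  // connects to the cluster
    // true requests the detailed view, which also carries tracker names
    ClusterStatus status = client.getClusterStatus(true);
    System.out.println("Task trackers: " + status.getTaskTrackers());
    System.out.println("Max map slots: " + status.getMaxMapTasks());
    System.out.println("Max reduce slots: " + status.getMaxReduceTasks());
    System.out.println("Active trackers: " + status.getActiveTrackerNames());
    client.close();
  }
}
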
Example 1
Source File: GenerateData.java    From RDFS with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext jobCtxt) throws IOException {
  final JobClient client =
    new JobClient(new JobConf(jobCtxt.getConfiguration()));
  ClusterStatus stat = client.getClusterStatus(true);
  final long toGen =
    jobCtxt.getConfiguration().getLong(GRIDMIX_GEN_BYTES, -1);
  if (toGen < 0) {
    throw new IOException("Invalid/missing generation bytes: " + toGen);
  }
  final int nTrackers = stat.getTaskTrackers();
  final long bytesPerTracker = toGen / nTrackers;
  final ArrayList<InputSplit> splits = new ArrayList<InputSplit>(nTrackers);
  final Pattern trackerPattern = Pattern.compile("tracker_([^:]*):.*");
  final Matcher m = trackerPattern.matcher("");
  for (String tracker : stat.getActiveTrackerNames()) {
    m.reset(tracker);
    if (!m.find()) {
      System.err.println("Skipping node: " + tracker);
      continue;
    }
    final String name = m.group(1);
    splits.add(new GenSplit(bytesPerTracker, new String[] { name }));
  }
  return splits;
}
 
Example 2
Source File: InfrastructureAnalyzer.java    From systemds with Apache License 2.0
/**
 * Analyzes properties of hadoop cluster and configuration.
 */
private static void analyzeHadoopCluster() {
	try {
		JobConf job = ConfigurationManager.getCachedJobConf();
		JobClient client = new JobClient(job);
		ClusterStatus stat = client.getClusterStatus();
		if( stat != null ) { //if in cluster mode
			//analyze cluster status
			_remotePar = stat.getTaskTrackers();
			_remoteParMap = stat.getMaxMapTasks(); 
			_remoteParReduce = stat.getMaxReduceTasks(); 
			
			//analyze pure configuration properties
			analyzeHadoopConfiguration();
		}
	} 
	catch (IOException e) {
		throw new RuntimeException("Unable to analyze infrastructure.",e);
	}
}
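
The two overloads used above differ only in how much detail the cluster reports back: getClusterStatus() returns summary counters, while getClusterStatus(true), as in Example 1, also fetches the active tracker name list. A minimal fragment illustrating both (assuming a connected JobClient named client, as in the examples):

// Summary form: slot and tracker counts only.
ClusterStatus summary = client.getClusterStatus();
int trackers = summary.getTaskTrackers();

// Detailed form: additionally populates the tracker name collections.
ClusterStatus detailed = client.getClusterStatus(true);
for (String tracker : detailed.getActiveTrackerNames()) {
  System.out.println(tracker);
}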
 
Example 3
Source File: GenerateData.java    From hadoop with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext jobCtxt) throws IOException {
  final JobClient client =
    new JobClient(new JobConf(jobCtxt.getConfiguration()));
  ClusterStatus stat = client.getClusterStatus(true);
  final long toGen =
    jobCtxt.getConfiguration().getLong(GRIDMIX_GEN_BYTES, -1);
  if (toGen < 0) {
    throw new IOException("Invalid/missing generation bytes: " + toGen);
  }
  final int nTrackers = stat.getTaskTrackers();
  final long bytesPerTracker = toGen / nTrackers;
  final ArrayList<InputSplit> splits = new ArrayList<InputSplit>(nTrackers);
  final Pattern trackerPattern = Pattern.compile("tracker_([^:]*):.*");
  final Matcher m = trackerPattern.matcher("");
  for (String tracker : stat.getActiveTrackerNames()) {
    m.reset(tracker);
    if (!m.find()) {
      System.err.println("Skipping node: " + tracker);
      continue;
    }
    final String name = m.group(1);
    splits.add(new GenSplit(bytesPerTracker, new String[] { name }));
  }
  return splits;
}
 
Example 4
Source File: TestCluster.java    From imputationserver with GNU Affero General Public License v3.0
public void start() throws IOException {

	File testCluster = new File(WORKING_DIRECTORY);
	if (testCluster.exists()) {
		FileUtil.deleteDirectory(testCluster);
	}
	testCluster.mkdirs();

	File testClusterData = new File(WORKING_DIRECTORY + "/data");
	File testClusterLog = new File(WORKING_DIRECTORY + "/logs");

	if (cluster == null) {

		conf = new HdfsConfiguration();
		conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR,
				testClusterData.getAbsolutePath());
		cluster = new MiniDFSCluster.Builder(conf).build();
		fs = cluster.getFileSystem();

		// set minicluster as default config
		HdfsUtil.setDefaultConfiguration(conf);
		System.setProperty("hadoop.log.dir", testClusterLog.getAbsolutePath());
		MiniMRCluster mrCluster = new MiniMRCluster(1,
				fs.getUri().toString(), 1, null, null, new JobConf(conf));
		JobConf mrClusterConf = mrCluster.createJobConf();
		HdfsUtil.setDefaultConfiguration(new Configuration(mrClusterConf));

		System.out.println("------");

		JobClient client = new JobClient(mrClusterConf);
		ClusterStatus status = client.getClusterStatus(true);
		System.out.println(status.getActiveTrackerNames());
	}
}
 
Example 5
Source File: RandomWriter.java    From big-c with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return 2;
  }
  
  Path outDir = new Path(args[0]);
  Configuration conf = getConf();
  JobClient client = new JobClient(conf);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have" + BYTES_PER_MAP + " set to 0");
    return -2;
  }
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = Job.getInstance(conf);
  
  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomMapper.class);        
  job.setReducerClass(Reducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  int ret = job.waitForCompletion(true) ? 0 : 1;
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return ret;
}
 
Example 6
Source File: RandomWriter.java    From RDFS with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }
  
  Path outDir = new Path(args[0]);
  JobConf job = new JobConf(getConf());
  
  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);
  
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  
  job.setInputFormat(RandomInputFormat.class);
  job.setMapperClass(Map.class);        
  job.setReducerClass(IdentityReducer.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  
  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
  long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map",
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomwrite.total_bytes", 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
  }
  
  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return 0;
}
 
Example 7
Source File: RandomTextWriter.java    From hadoop-gpu with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    return printUsage();    
  }
  
  JobConf job = new JobConf(getConf());
  
  job.setJarByClass(RandomTextWriter.class);
  job.setJobName("random-text-writer");
  
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  
  job.setInputFormat(RandomWriter.RandomInputFormat.class);
  job.setMapperClass(Map.class);        
  
  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomtextwrite.maps_per_host", 10);
  long numBytesToWritePerMap = job.getLong("test.randomtextwrite.bytes_per_map",
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomtextwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomtextwrite.total_bytes", 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomtextwrite.bytes_per_map", totalBytesToWrite);
  }
  
  Class<? extends OutputFormat> outputFormatClass = 
    SequenceFileOutputFormat.class;
  List<String> otherArgs = new ArrayList<String>();
  for(int i=0; i < args.length; ++i) {
    try {
      if ("-outFormat".equals(args[i])) {
        outputFormatClass = 
          Class.forName(args[++i]).asSubclass(OutputFormat.class);
      } else {
        otherArgs.add(args[i]);
      }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " +
          args[i-1]);
      return printUsage(); // exits
    }
  }

  job.setOutputFormat(outputFormatClass);
  FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));
  
  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return 0;
}
 
Example 8
Source File: GenerateDistCacheData.java    From big-c with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext jobCtxt) throws IOException {
  final JobConf jobConf = new JobConf(jobCtxt.getConfiguration());
  final JobClient client = new JobClient(jobConf);
  ClusterStatus stat = client.getClusterStatus(true);
  int numTrackers = stat.getTaskTrackers();
  final int fileCount = jobConf.getInt(GRIDMIX_DISTCACHE_FILE_COUNT, -1);

  // Total size of distributed cache files to be generated
  final long totalSize = jobConf.getLong(GRIDMIX_DISTCACHE_BYTE_COUNT, -1);
  // Get the path of the special file
  String distCacheFileList = jobConf.get(GRIDMIX_DISTCACHE_FILE_LIST);
  if (fileCount < 0 || totalSize < 0 || distCacheFileList == null) {
    throw new RuntimeException("Invalid metadata: #files (" + fileCount
        + "), total_size (" + totalSize + "), filelisturi ("
        + distCacheFileList + ")");
  }

  Path sequenceFile = new Path(distCacheFileList);
  FileSystem fs = sequenceFile.getFileSystem(jobConf);
  FileStatus srcst = fs.getFileStatus(sequenceFile);
  // Consider the number of TTs * mapSlotsPerTracker as number of mappers.
  int numMapSlotsPerTracker = jobConf.getInt(TTConfig.TT_MAP_SLOTS, 2);
  int numSplits = numTrackers * numMapSlotsPerTracker;

  List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
  LongWritable key = new LongWritable();
  BytesWritable value = new BytesWritable();

  // Average size of data to be generated by each map task
  final long targetSize = Math.max(totalSize / numSplits,
                            DistributedCacheEmulator.AVG_BYTES_PER_MAP);
  long splitStartPosition = 0L;
  long splitEndPosition = 0L;
  long acc = 0L;
  long bytesRemaining = srcst.getLen();
  SequenceFile.Reader reader = null;
  try {
    reader = new SequenceFile.Reader(fs, sequenceFile, jobConf);
    while (reader.next(key, value)) {

      // If adding this file would put this split past the target size,
      // cut the last split and put this file in the next split.
      if (acc + key.get() > targetSize && acc != 0) {
        long splitSize = splitEndPosition - splitStartPosition;
        splits.add(new FileSplit(
            sequenceFile, splitStartPosition, splitSize, (String[])null));
        bytesRemaining -= splitSize;
        splitStartPosition = splitEndPosition;
        acc = 0L;
      }
      acc += key.get();
      splitEndPosition = reader.getPosition();
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }
  if (bytesRemaining != 0) {
    splits.add(new FileSplit(
        sequenceFile, splitStartPosition, bytesRemaining, (String[])null));
  }

  return splits;
}
 
Example 9
Source File: RandomWriter.java    From tez with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
@SuppressWarnings("deprecation")
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return 2;
  }
  
  Path outDir = new Path(args[0]);
  Configuration conf = getConf();
  JobClient client = new JobClient(conf);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have" + BYTES_PER_MAP + " set to 0");
    return -2;
  }
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = new Job(conf);
  
  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomMapper.class);        
  job.setReducerClass(Reducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  int ret = job.waitForCompletion(true) ? 0 : 1;
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return ret;
}
 
Example 10
Source File: RandomWriter.java    From hadoop-book with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job. It
 * runs 10 maps/node and each node writes 1 gig of data to a DFS file. The
 * reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        System.out.println("Usage: writer <out-dir>");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path outDir = new Path(args[0]);
    JobConf job = new JobConf(getConf());

    job.setJarByClass(RandomWriter.class);
    job.setJobName("random-writer");
    FileOutputFormat.setOutputPath(job, outDir);

    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(IdentityReducer.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient client = new JobClient(job);
    ClusterStatus cluster = client.getClusterStatus();
    int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
    long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map",
            1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
        System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
        return -2;
    }
    long totalBytesToWrite = job.getLong("test.randomwrite.total_bytes",
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
    }

    job.setNumMapTasks(numMaps);
    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + (endTime.getTime() - startTime.getTime()) / 1000
            + " seconds.");

    return 0;
}
 
Example 11
Source File: RandomTextWriter.java    From big-c with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    return printUsage();    
  }
  
  Configuration conf = getConf();
  JobClient client = new JobClient(conf);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have " + BYTES_PER_MAP +" set to 0");
    return -2;
  }
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);
  
  Job job = Job.getInstance(conf);
  
  job.setJarByClass(RandomTextWriter.class);
  job.setJobName("random-text-writer");
  
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  
  job.setInputFormatClass(RandomWriter.RandomInputFormat.class);
  job.setMapperClass(RandomTextMapper.class);        
  
  Class<? extends OutputFormat> outputFormatClass = 
    SequenceFileOutputFormat.class;
  List<String> otherArgs = new ArrayList<String>();
  for(int i=0; i < args.length; ++i) {
    try {
      if ("-outFormat".equals(args[i])) {
        outputFormatClass = 
          Class.forName(args[++i]).asSubclass(OutputFormat.class);
      } else {
        otherArgs.add(args[i]);
      }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " +
          args[i-1]);
      return printUsage(); // exits
    }
  }

  job.setOutputFormatClass(outputFormatClass);
  FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));
  
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  int ret = job.waitForCompletion(true) ? 0 : 1;
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return ret;
}
 
Example 12
Source File: RandomWriter.java    From pravega-samples with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        System.out.println("Usage: writer <out-dir>");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    Path outDir = new Path(args[0]);
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
            1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
        System.err.println("Cannot have" + BYTES_PER_MAP + " set to 0");
        return -2;
    }
    long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
    }
    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

    Job job = Job.getInstance(conf);

    job.setJarByClass(RandomWriter.class);
    job.setJobName("random-writer");
    FileOutputFormat.setOutputPath(job, outDir);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setInputFormatClass(RandomInputFormat.class);
    job.setMapperClass(RandomMapper.class);
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took " +
            (endTime.getTime() - startTime.getTime()) / 1000 +
            " seconds.");

    return ret;
}
 
Example 13
Source File: RandomTextWriter.java    From pravega-samples with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
    if (args.length < 4) {
        return printUsage();
    }

    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
            1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
        System.err.println("Cannot have " + BYTES_PER_MAP + " set to 0");
        return -2;
    }
    long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
    }
    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

    conf.setStrings("pravega.uri", args[1]);
    conf.setStrings("pravega.scope", args[2]);
    conf.setStrings("pravega.stream", args[3]);
    conf.setStrings("pravega.deserializer", TextSerializer.class.getName());

    Job job = Job.getInstance(conf);

    job.setJarByClass(RandomTextWriter.class);
    job.setJobName("random-text-writer");

    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(RandomWriter.RandomInputFormat.class);
    job.setMapperClass(RandomTextMapper.class);
    job.setOutputFormatClass(PravegaFixedSegmentsOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(args[0]));

    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took " +
            (endTime.getTime() - startTime.getTime()) / 1000 +
            " seconds.");

    return ret;
}
 
Example 14
Source File: GenerateDistCacheData.java    From hadoop with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext jobCtxt) throws IOException {
  final JobConf jobConf = new JobConf(jobCtxt.getConfiguration());
  final JobClient client = new JobClient(jobConf);
  ClusterStatus stat = client.getClusterStatus(true);
  int numTrackers = stat.getTaskTrackers();
  final int fileCount = jobConf.getInt(GRIDMIX_DISTCACHE_FILE_COUNT, -1);

  // Total size of distributed cache files to be generated
  final long totalSize = jobConf.getLong(GRIDMIX_DISTCACHE_BYTE_COUNT, -1);
  // Get the path of the special file
  String distCacheFileList = jobConf.get(GRIDMIX_DISTCACHE_FILE_LIST);
  if (fileCount < 0 || totalSize < 0 || distCacheFileList == null) {
    throw new RuntimeException("Invalid metadata: #files (" + fileCount
        + "), total_size (" + totalSize + "), filelisturi ("
        + distCacheFileList + ")");
  }

  Path sequenceFile = new Path(distCacheFileList);
  FileSystem fs = sequenceFile.getFileSystem(jobConf);
  FileStatus srcst = fs.getFileStatus(sequenceFile);
  // Consider the number of TTs * mapSlotsPerTracker as number of mappers.
  int numMapSlotsPerTracker = jobConf.getInt(TTConfig.TT_MAP_SLOTS, 2);
  int numSplits = numTrackers * numMapSlotsPerTracker;

  List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
  LongWritable key = new LongWritable();
  BytesWritable value = new BytesWritable();

  // Average size of data to be generated by each map task
  final long targetSize = Math.max(totalSize / numSplits,
                            DistributedCacheEmulator.AVG_BYTES_PER_MAP);
  long splitStartPosition = 0L;
  long splitEndPosition = 0L;
  long acc = 0L;
  long bytesRemaining = srcst.getLen();
  SequenceFile.Reader reader = null;
  try {
    reader = new SequenceFile.Reader(fs, sequenceFile, jobConf);
    while (reader.next(key, value)) {

      // If adding this file would put this split past the target size,
      // cut the last split and put this file in the next split.
      if (acc + key.get() > targetSize && acc != 0) {
        long splitSize = splitEndPosition - splitStartPosition;
        splits.add(new FileSplit(
            sequenceFile, splitStartPosition, splitSize, (String[])null));
        bytesRemaining -= splitSize;
        splitStartPosition = splitEndPosition;
        acc = 0L;
      }
      acc += key.get();
      splitEndPosition = reader.getPosition();
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }
  if (bytesRemaining != 0) {
    splits.add(new FileSplit(
        sequenceFile, splitStartPosition, bytesRemaining, (String[])null));
  }

  return splits;
}
 
Example 15
Source File: RandomWriter.java    From hadoop with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return 2;
  }
  
  Path outDir = new Path(args[0]);
  Configuration conf = getConf();
  JobClient client = new JobClient(conf);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have" + BYTES_PER_MAP + " set to 0");
    return -2;
  }
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = Job.getInstance(conf);
  
  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomMapper.class);        
  job.setReducerClass(Reducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  int ret = job.waitForCompletion(true) ? 0 : 1;
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return ret;
}
 
Example 16
Source File: RandomTextWriter.java    From hadoop with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    return printUsage();    
  }
  
  Configuration conf = getConf();
  JobClient client = new JobClient(conf);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have " + BYTES_PER_MAP +" set to 0");
    return -2;
  }
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);
  
  Job job = Job.getInstance(conf);
  
  job.setJarByClass(RandomTextWriter.class);
  job.setJobName("random-text-writer");
  
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  
  job.setInputFormatClass(RandomWriter.RandomInputFormat.class);
  job.setMapperClass(RandomTextMapper.class);        
  
  Class<? extends OutputFormat> outputFormatClass = 
    SequenceFileOutputFormat.class;
  List<String> otherArgs = new ArrayList<String>();
  for(int i=0; i < args.length; ++i) {
    try {
      if ("-outFormat".equals(args[i])) {
        outputFormatClass = 
          Class.forName(args[++i]).asSubclass(OutputFormat.class);
      } else {
        otherArgs.add(args[i]);
      }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " +
          args[i-1]);
      return printUsage(); // exits
    }
  }

  job.setOutputFormatClass(outputFormatClass);
  FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));
  
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  int ret = job.waitForCompletion(true) ? 0 : 1;
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return ret;
}
 
Example 17
Source File: RandomWriter.java    From hadoop with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return 2;
  }
  
  Path outDir = new Path(args[0]);
  Configuration conf = getConf();
  JobClient client = new JobClient(conf);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have" + BYTES_PER_MAP + " set to 0");
    return -2;
  }
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = Job.getInstance(conf);
  
  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomMapper.class);        
  job.setReducerClass(Reducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  int ret = job.waitForCompletion(true) ? 0 : 1;
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return ret;
}
 
Example 18
Source File: RandomWriter.java    From incubator-tez with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return 2;
  }
  
  Path outDir = new Path(args[0]);
  Configuration conf = getConf();
  JobClient client = new JobClient(conf);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP,
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have" + BYTES_PER_MAP + " set to 0");
    return -2;
  }
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES, 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = new Job(conf);
  
  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomMapper.class);        
  job.setReducerClass(Reducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  int ret = job.waitForCompletion(true) ? 0 : 1;
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return ret;
}