org.apache.hadoop.mapred.jobcontrol.JobControl Java Examples

The following examples show how to use org.apache.hadoop.mapred.jobcontrol.JobControl. They are taken from open-source projects; the source file and the project it comes from are noted above each example.
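JobControl manages a group of org.apache.hadoop.mapred.jobcontrol.Job instances together with their dependencies: jobs whose prerequisites have completed successfully are submitted automatically, and since JobControl implements Runnable it is typically driven from its own thread until allFinished() returns true. Before the project examples, here is a minimal sketch of that pattern, assuming two already-configured JobConf objects (the names firstConf and secondConf are illustrative):

import java.util.ArrayList;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;

public class JobControlSketch {
  public static void runChain(JobConf firstConf, JobConf secondConf) throws Exception {
    Job first = new Job(firstConf);

    // the second job runs only after the first completes successfully
    ArrayList<Job> deps = new ArrayList<Job>();
    deps.add(first);
    Job second = new Job(secondConf, deps);

    JobControl control = new JobControl("ExampleChain");
    control.addJob(first);
    control.addJob(second);

    // JobControl implements Runnable; drive it from its own thread
    Thread runner = new Thread(control);
    runner.start();
    while (!control.allFinished()) {
      Thread.sleep(500);
    }
    control.stop();
  }
}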
Example #1
Source File: ValueAggregatorJob.java    From hadoop with Apache License 2.0
public static JobControl createValueAggregatorJobs(String args[]
  , Class<? extends ValueAggregatorDescriptor>[] descriptors) throws IOException {
  
  JobControl theControl = new JobControl("ValueAggregatorJobs");
  ArrayList<Job> dependingJobs = new ArrayList<Job>();
  JobConf aJobConf = createValueAggregatorJob(args);
  if(descriptors != null)
    setAggregatorDescriptors(aJobConf, descriptors);
  Job aJob = new Job(aJobConf, dependingJobs);
  theControl.addJob(aJob);
  return theControl;
}
 
Example #2
Source File: ValueAggregatorJob.java    From big-c with Apache License 2.0
public static JobControl createValueAggregatorJobs(String args[]
  , Class<? extends ValueAggregatorDescriptor>[] descriptors) throws IOException {
  
  JobControl theControl = new JobControl("ValueAggregatorJobs");
  ArrayList<Job> dependingJobs = new ArrayList<Job>();
  JobConf aJobConf = createValueAggregatorJob(args);
  if(descriptors != null)
    setAggregatorDescriptors(aJobConf, descriptors);
  Job aJob = new Job(aJobConf, dependingJobs);
  theControl.addJob(aJob);
  return theControl;
}
 
Example #3
Source File: ValueAggregatorJob.java    From hadoop-gpu with Apache License 2.0
public static JobControl createValueAggregatorJobs(String args[]
  , Class<? extends ValueAggregatorDescriptor>[] descriptors) throws IOException {
  
  JobControl theControl = new JobControl("ValueAggregatorJobs");
  ArrayList<Job> dependingJobs = new ArrayList<Job>();
  JobConf aJobConf = createValueAggregatorJob(args);
  if(descriptors != null)
    setAggregatorDescriptors(aJobConf, descriptors);
  Job aJob = new Job(aJobConf, dependingJobs);
  theControl.addJob(aJob);
  return theControl;
}
 
Example #4
Source File: Launcher.java    From spork with Apache License 2.0
/**
 * Compute the progress of the jobs submitted through the JobControl
 * object jc: each successful job counts as 1.0, and each running job
 * contributes its fractional progress.
 *
 * @param jc
 *            - The JobControl object that has been submitted
 * @return The cumulative progress of the submitted jobs in double format
 * @throws IOException
 */
protected double calculateProgress(JobControl jc)
        throws IOException {
    double prog = 0.0;
    // each successfully completed job counts as 1.0
    prog += jc.getSuccessfulJobs().size();

    // add the fractional progress of every job still running
    List<Job> runningJobs = jc.getRunningJobs();
    for (Job j : runningJobs) {
        prog += HadoopShims.progressOfRunningJob(j);
    }
    return prog;
}
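The value above is a cumulative job count, not a percentage: a caller turns it into one by dividing by the total number of jobs in the plan. A minimal sketch of that conversion, assuming the caller tracks a totalJobs count (the variable is illustrative, not part of the Launcher API):

// hypothetical caller; totalJobs is assumed to be tracked by the caller
double jobsDone = calculateProgress(jc);
double percent = totalJobs > 0 ? 100.0 * jobsDone / totalJobs : 0.0;
System.out.println("MapReduce plan " + (int) percent + "% complete");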
 
Example #5
Source File: ValueAggregatorJob.java    From RDFS with Apache License 2.0
public static JobControl createValueAggregatorJobs(String args[]
  , Class<? extends ValueAggregatorDescriptor>[] descriptors) throws IOException {
  
  JobControl theControl = new JobControl("ValueAggregatorJobs");
  ArrayList<Job> dependingJobs = new ArrayList<Job>();
  JobConf aJobConf = createValueAggregatorJob(args);
  if(descriptors != null)
    setAggregatorDescriptors(aJobConf, descriptors);
  Job aJob = new Job(aJobConf, dependingJobs);
  theControl.addJob(aJob);
  return theControl;
}
 
Example #6
Source File: TestGroupConstParallelMR.java    From spork with Apache License 2.0
@Override
public void checkGroupConstWithParallelResult(PhysicalPlan pp, PigContext pc) throws Exception {
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    
    ConfigurationValidator.validatePigProperties(pc.getProperties());
    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    
    JobControl jobControl = jcc.compile(mrPlan, "Test");
    Job job = jobControl.getWaitingJobs().get(0);
    int parallel = job.getJobConf().getNumReduceTasks();

    assertEquals("parallism", 1, parallel);
}
 
Example #7
Source File: TestGroupConstParallelMR.java    From spork with Apache License 2.0
@Override
public void checkGroupNonConstWithParallelResult(PhysicalPlan pp, PigContext pc) throws Exception {
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    
    ConfigurationValidator.validatePigProperties(pc.getProperties());
    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    
    JobControl jobControl = jcc.compile(mrPlan, "Test");
    Job job = jobControl.getWaitingJobs().get(0);
    int parallel = job.getJobConf().getNumReduceTasks();
    
    assertEquals("parallism", 100, parallel);
}
 
Example #8
Source File: TestJobControlCompiler.java    From spork with Apache License 2.0
/**
 * specifically tests that REGISTERED jars get added to the distributed cache
 * @throws Exception
 */
@Test
public void testJarAddedToDistributedCache() throws Exception {

  // creating a jar with a UDF *not* in the current classloader
  File tmpFile = File.createTempFile("Some_", ".jar");
  tmpFile.deleteOnExit();
  String className = createTestJar(tmpFile);
  final String testUDFFileName = className+".class";

  // JobControlCompiler setup
  PigServer pigServer = new PigServer(ExecType.MAPREDUCE);
  PigContext pigContext = pigServer.getPigContext();
  pigContext.connect();
  pigContext.addJar(tmpFile.getAbsolutePath());
  JobControlCompiler jobControlCompiler = new JobControlCompiler(pigContext, CONF);
  MROperPlan plan = new MROperPlan();
  MapReduceOper mro = new MapReduceOper(new OperatorKey());
  mro.UDFs = new HashSet<String>();
  mro.UDFs.add(className+"()");
  plan.add(mro);

  // compiling the job
  JobControl jobControl = jobControlCompiler.compile(plan , "test");
  JobConf jobConf = jobControl.getWaitingJobs().get(0).getJobConf();

  // verifying the jar gets on distributed cache
  Path[] fileClassPaths = DistributedCache.getFileClassPaths(jobConf);
  // guava jar is not shipped with Hadoop 2.x
  Assert.assertEquals("size for "+Arrays.toString(fileClassPaths), HadoopShims.isHadoopYARN() ? 5 : 6, fileClassPaths.length);
  Path distributedCachePath = fileClassPaths[0];
  Assert.assertEquals("ends with jar name: "+distributedCachePath, distributedCachePath.getName(), tmpFile.getName());
  // hadoop bug requires path to not contain hdfs://hostname in front
  Assert.assertTrue("starts with /: "+distributedCachePath,
      distributedCachePath.toString().startsWith("/"));
  Assert.assertTrue("jar pushed to distributed cache should contain testUDF",
      jarContainsFileNamed(new File(fileClassPaths[0].toUri().getPath()), testUDFFileName));
}
 
Example #9
Source File: TestJobControlCompiler.java    From spork with Apache License 2.0
private JobConf compileTestJob(final PigContext pigContext, Configuration conf)
        throws JobCreationException {
    final JobControlCompiler jobControlCompiler = new JobControlCompiler(
            pigContext, conf);

    final MROperPlan plan = new MROperPlan();
    plan.add(new MapReduceOper(new OperatorKey()));

    final JobControl jobControl = jobControlCompiler.compile(plan, "test");
    final JobConf jobConf = jobControl.getWaitingJobs().get(0).getJobConf();
    return jobConf;
}
 
Example #10
Source File: TestJobSubmissionMR.java    From spork with Apache License 2.0
@Override
public void checkDefaultParallelResult(PhysicalPlan pp, PigContext pc) throws Exception {
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    
    ConfigurationValidator.validatePigProperties(pc.getProperties());
    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);

    JobControl jobControl = jcc.compile(mrPlan, "Test");
    Job job = jobControl.getWaitingJobs().get(0);
    int parallel = job.getJobConf().getNumReduceTasks();

    assertEquals(100, parallel);
    Util.assertParallelValues(100, -1, -1, 100, job.getJobConf());
}
 
Example #11
Source File: ValueAggregatorJob.java    From hadoop-gpu with Apache License 2.0
public static JobControl createValueAggregatorJobs(String args[]) throws IOException {
  return createValueAggregatorJobs(args, null);
}
 
Example #12
Source File: ValueAggregatorJob.java    From RDFS with Apache License 2.0
public static JobControl createValueAggregatorJobs(String args[]) throws IOException {
  return createValueAggregatorJobs(args, null);
}
 
Example #13
Source File: HadoopShims.java    From spork with Apache License 2.0
public static JobControl newJobControl(String groupName, int timeToSleep) {
  return new PigJobControl(groupName, timeToSleep);
}
 
Example #14
Source File: HadoopShims.java    From spork with Apache License 2.0
public static JobControl newJobControl(String groupName, int timeToSleep) {
    return new PigJobControl(groupName, timeToSleep);
}
 
Example #15
Source File: TestJobSubmission.java    From spork with Apache License 2.0
@Test
public void testReducerNumEstimationForOrderBy() throws Exception {
    // Skip the test for Tez. Tez uses a different mechanism;
    // the equivalent test is in TestTezAutoParallelism.
    Assume.assumeTrue("Skip this test for TEZ",
            Util.isMapredExecType(cluster.getExecType()));
    // use the estimation
    pc.getProperties().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getProperties().setProperty("pig.exec.reducers.max", "10");

    String query = "a = load '/passwd';" +
            "b = order a by $0;" +
            "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);

    MROperPlan mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jobControl = jcc.compile(mrPlan, query);

    assertEquals(2, mrPlan.size());

    // first job uses a single reducer for the sampling
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    // Simulate the first job having run so estimation kicks in.
    MapReduceOper sort = mrPlan.getLeaves().get(0);
    jcc.updateMROpPlan(jobControl.getReadyJobs());
    FileLocalizer.create(sort.getQuantFile(), pc);
    jobControl = jcc.compile(mrPlan, query);

    sort = mrPlan.getLeaves().get(0);
    long reducer = Math.min((long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0), 10);
    assertEquals(reducer, sort.getRequestedParallelism());

    // the second job estimates reducers
    Util.assertParallelValues(-1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());

    // use the PARALLEL keyword; it overrides the estimated reducer number
    query = "a = load '/passwd';" + "b = order a by $0 PARALLEL 2;" +
            "store b into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);
    assertEquals(2, sort.getRequestedParallelism());

    // the estimation won't take effect when it applies to non-DFS input or when the files don't exist, as with HBase
    query = "a = load 'hbase://passwd' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');" +
            "b = order a by $0 ;" +
            "store b into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);

    // the requested parallelism will be -1 if the user sets neither default_parallel nor PARALLEL
    // and the estimation doesn't take effect; the MR framework will finally set it to 1.
    assertEquals(-1, sort.getRequestedParallelism());

    // test order by with three jobs (after optimization)
    query = "a = load '/passwd';" +
            "b = foreach a generate $0, $1, $2;" +
            "c = order b by $0;" +
            "store c into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(3, mrPlan.size());

    // Simulate the first 2 jobs having run so estimation kicks in.
    sort = mrPlan.getLeaves().get(0);
    FileLocalizer.create(sort.getQuantFile(), pc);

    jobControl = jcc.compile(mrPlan, query);
    Util.copyFromLocalToCluster(cluster, "test/org/apache/pig/test/data/passwd", ((POLoad) sort.mapPlan.getRoots().get(0)).getLFile().getFileName());

    // First job is just a foreach with projection, a mapper-only job, so the estimate gets ignored
    Util.assertParallelValues(-1, -1, -1, 0, jobControl.getWaitingJobs().get(0).getJobConf());

    jcc.updateMROpPlan(jobControl.getReadyJobs());
    jobControl = jcc.compile(mrPlan, query);
    jcc.updateMROpPlan(jobControl.getReadyJobs());

    // Second job is a sampler, which requests and gets 1 reducer
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    jobControl = jcc.compile(mrPlan, query);
    sort = mrPlan.getLeaves().get(0);
    assertEquals(reducer, sort.getRequestedParallelism());

    // Third job is the order-by, which uses the estimated number of reducers
    Util.assertParallelValues(-1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());
}
 
Example #16
Source File: TestJobSubmission.java    From spork with Apache License 2.0
@Test
public void testReducerNumEstimation() throws Exception {
    // Skip the test for Tez. Tez uses a different mechanism;
    // the equivalent test is in TestTezAutoParallelism.
    Assume.assumeTrue("Skip this test for TEZ",
            Util.isMapredExecType(cluster.getExecType()));
    // use the estimation
    Configuration conf = HBaseConfiguration.create(new Configuration());
    HBaseTestingUtility util = new HBaseTestingUtility(conf);
    int clientPort = util.startMiniZKCluster().getClientPort();
    util.startMiniHBaseCluster(1, 1);

    String query = "a = load '/passwd';" +
            "b = group a by $0;" +
            "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");
    pc.getConf().setProperty(HConstants.ZOOKEEPER_CLIENT_PORT, Integer.toString(clientPort));
    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jc = jcc.compile(mrPlan, "Test");
    Job job = jc.getWaitingJobs().get(0);
    long reducer = Math.min((long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0), 10);

    Util.assertParallelValues(-1, -1, reducer, reducer, job.getJobConf());

    // use the PARALLEL keyword; it overrides the estimated reducer number
    query = "a = load '/passwd';" +
            "b = group a by $0 PARALLEL 2;" +
            "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");
    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    jcc = new JobControlCompiler(pc, conf);
    jc = jcc.compile(mrPlan, "Test");
    job = jc.getWaitingJobs().get(0);

    Util.assertParallelValues(-1, 2, -1, 2, job.getJobConf());

    final byte[] COLUMNFAMILY = Bytes.toBytes("pig");
    util.createTable(Bytes.toBytesBinary("test_table"), COLUMNFAMILY);

    // the estimation won't take effect when it applies to non-DFS input or when the files don't exist, as with HBase
    query = "a = load 'hbase://test_table' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');" +
            "b = group a by $0 ;" +
            "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");

    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    jcc = new JobControlCompiler(pc, conf);
    jc = jcc.compile(mrPlan, "Test");
    job = jc.getWaitingJobs().get(0);

    Util.assertParallelValues(-1, -1, -1, 1, job.getJobConf());

    util.deleteTable(Bytes.toBytesBinary("test_table"));
    // In HBase 0.90.1 and above we can use util.shutdownMiniHBaseCluster()
    // here instead.
    MiniHBaseCluster hbc = util.getHBaseCluster();
    if (hbc != null) {
        hbc.shutdown();
        hbc.join();
    }
    util.shutdownMiniZKCluster();
}
 
Example #17
Source File: ValueAggregatorJob.java    From big-c with Apache License 2.0
public static JobControl createValueAggregatorJobs(String args[]) throws IOException {
  return createValueAggregatorJobs(args, null);
}
 
Example #18
Source File: ValueAggregatorJob.java    From hadoop with Apache License 2.0
public static JobControl createValueAggregatorJobs(String args[]) throws IOException {
  return createValueAggregatorJobs(args, null);
}