Java Code Examples for com.amazonaws.services.elasticmapreduce.model.StepConfig

The following examples show how to use com.amazonaws.services.elasticmapreduce.model.StepConfig. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: herd   Source File: MockEmrOperationsImpl.java    License: Apache License 2.0 6 votes vote down vote up
private MockEmrJobFlow createNewCluster(RunJobFlowRequest jobFlowRequest, String status, StatusChangeReason reason, StatusTimeline timeline)
{
    // Build the mock cluster and register it under a freshly generated job flow id.
    MockEmrJobFlow mockCluster = new MockEmrJobFlow();
    mockCluster.setJobFlowId(getNewJobFlowId());
    mockCluster.setJobFlowName(jobFlowRequest.getName());
    mockCluster.setStatus(status);
    mockCluster.setStatusTimeline(timeline);
    mockCluster.setStatusChangeReason(reason);
    emrClusters.put(mockCluster.getJobFlowId(), mockCluster);

    // Register every step requested for this job flow against the new cluster.
    jobFlowRequest.getSteps().forEach(stepConfig -> addClusterStep(mockCluster.getJobFlowId(), stepConfig));

    return mockCluster;
}
 
Example 2
Source Project: datacollector   Source File: EmrClusterJob.java    License: Apache License 2.0 6 votes vote down vote up
@Override
public Properties submitJob(Properties jobProps) throws IOException {
  EMRJobConfig jobConfig = new EMRJobConfig(jobProps);
  Utils.checkNotNull(jobConfig.getClusterId(), "EMR Cluster Id");
  // Assemble the hadoop jar step that launches the driver on the cluster.
  HadoopJarStepConfig jarStep = new HadoopJarStepConfig()
      .withJar(jobConfig.getDriverJarPath())
      .withMainClass(jobConfig.getDriverMainClass())
      .withArgs(
          jobConfig.getArchives(),
          jobConfig.getLibjars(),
          jobConfig.getUniquePrefix(),
          jobConfig.getJavaOpts(),
          jobConfig.getLogLevel());
  StepConfig stepConfig = new StepConfig()
      .withName(jobConfig.getJobName())
      .withActionOnFailure(ActionOnFailure.CONTINUE) // check if action on failure needs to be configurable
      .withHadoopJarStep(jarStep);
  LOG.debug("Step config is {}", stepConfig.toString());
  // Submit the step to the already-running cluster and record the resulting step id.
  AddJobFlowStepsRequest stepsRequest = new AddJobFlowStepsRequest()
      .withJobFlowId(jobConfig.getClusterId())
      .withSteps(stepConfig);
  AddJobFlowStepsResult stepsResult = getEmrClient(emrClusterConfig).addJobFlowSteps(stepsRequest);
  jobProps.setProperty("stepId", stepsResult.getStepIds().get(0));
  return jobProps;
}
 
Example 3
Source Project: herd   Source File: EmrPigStepHelper.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public StepConfig getEmrStepConfig(Object step)
{
    EmrPigStep pigStep = (EmrPigStep) step;

    // Cancel the execution and wait by default; continue only when the user opted in.
    ActionOnFailure actionOnFailure =
        Boolean.TRUE.equals(pigStep.isContinueOnError()) ? ActionOnFailure.CONTINUE : ActionOnFailure.CANCEL_AND_WAIT;

    StepConfig stepConfig = new StepConfig().withName(pigStep.getStepName().trim()).withActionOnFailure(actionOnFailure);

    // If there are no arguments to the Pig script, run it as-is.
    if (CollectionUtils.isEmpty(pigStep.getScriptArguments()))
    {
        return stepConfig.withHadoopJarStep(new StepFactory().newRunPigScriptStep(pigStep.getScriptLocation().trim()));
    }

    // Otherwise pass the script arguments through to the Pig script.
    return stepConfig.withHadoopJarStep(new StepFactory().newRunPigScriptStep(pigStep.getScriptLocation().trim(),
        pigStep.getScriptArguments().toArray(new String[0])));
}
 
Example 4
Source Project: herd   Source File: EmrHiveStepHelper.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public StepConfig getEmrStepConfig(Object step)
{
    EmrHiveStep hiveStep = (EmrHiveStep) step;

    // Cancel the execution and wait by default; continue only when the user opted in.
    ActionOnFailure actionOnFailure =
        Boolean.TRUE.equals(hiveStep.isContinueOnError()) ? ActionOnFailure.CONTINUE : ActionOnFailure.CANCEL_AND_WAIT;

    StepConfig stepConfig = new StepConfig().withName(hiveStep.getStepName().trim()).withActionOnFailure(actionOnFailure);

    // If there are no arguments to the hive script, run it as-is.
    if (CollectionUtils.isEmpty(hiveStep.getScriptArguments()))
    {
        return stepConfig.withHadoopJarStep(new StepFactory().newRunHiveScriptStep(hiveStep.getScriptLocation().trim()));
    }

    // Prefix every argument with hive's "-d" (define) option before passing them along.
    List<String> hiveArgs = new ArrayList<>();
    for (String hiveArg : hiveStep.getScriptArguments())
    {
        hiveArgs.add("-d");
        hiveArgs.add(hiveArg);
    }
    return stepConfig.withHadoopJarStep(
        new StepFactory().newRunHiveScriptStep(hiveStep.getScriptLocation().trim(), hiveArgs.toArray(new String[0])));
}
 
Example 5
Source Project: herd   Source File: EmrHadoopJarStepHelper.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public StepConfig getEmrStepConfig(Object step)
{
    // Delegate construction of the hadoop jar step config to the shared EMR helper.
    EmrHadoopJarStep jarStep = (EmrHadoopJarStep) step;
    return emrHelper.getEmrHadoopJarStepConfig(
        jarStep.getStepName(),
        jarStep.getJarLocation(),
        jarStep.getMainClass(),
        jarStep.getScriptArguments(),
        jarStep.isContinueOnError());
}
 
Example 6
Source Project: herd   Source File: EmrShellStepHelper.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public StepConfig getEmrStepConfig(Object step)
{
    EmrShellStep shellStep = (EmrShellStep) step;

    // Amazon-provided hadoop jar used to execute shell scripts on the cluster.
    String shellScriptJar = configurationHelper.getProperty(ConfigurationValue.EMR_SHELL_SCRIPT_JAR_PATH);

    // Cancel the execution and wait by default; continue only when the user opted in.
    ActionOnFailure actionOnFailure =
        Boolean.TRUE.equals(shellStep.isContinueOnError()) ? ActionOnFailure.CONTINUE : ActionOnFailure.CANCEL_AND_WAIT;

    // The first argument is the script location, followed by any trimmed script arguments.
    List<String> arguments = new ArrayList<>();
    arguments.add(shellStep.getScriptLocation().trim());
    if (!CollectionUtils.isEmpty(shellStep.getScriptArguments()))
    {
        for (String scriptArgument : shellStep.getScriptArguments())
        {
            arguments.add(scriptArgument.trim());
        }
    }

    // Build and return the StepConfig object.
    return new StepConfig().withName(shellStep.getStepName().trim()).withActionOnFailure(actionOnFailure)
        .withHadoopJarStep(new HadoopJarStepConfig(shellScriptJar).withArgs(arguments));
}
 
Example 7
Source Project: herd   Source File: EmrHelper.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the StepConfig for a Hadoop jar step.
 *
 * @param stepName the step name.
 * @param jarLocation the location of the jar.
 * @param mainClass the main class.
 * @param scriptArguments the step arguments; may be null or empty.
 * @param isContinueOnError when true, continue on step failure; otherwise cancel and wait.
 *
 * @return the stepConfig.
 */
public StepConfig getEmrHadoopJarStepConfig(String stepName, String jarLocation, String mainClass, List<String> scriptArguments, Boolean isContinueOnError)
{
    // Cancel the execution and wait by default; continue only when explicitly requested.
    ActionOnFailure actionOnFailure =
        Boolean.TRUE.equals(isContinueOnError) ? ActionOnFailure.CONTINUE : ActionOnFailure.CANCEL_AND_WAIT;

    // Assemble the hadoop jar step; arguments are attached only when supplied.
    HadoopJarStepConfig hadoopJarStepConfig = new HadoopJarStepConfig().withJar(jarLocation.trim()).withMainClass(mainClass);
    if (!CollectionUtils.isEmpty(scriptArguments))
    {
        hadoopJarStepConfig.withArgs(scriptArguments.toArray(new String[0]));
    }

    return new StepConfig().withName(stepName.trim()).withActionOnFailure(actionOnFailure).withHadoopJarStep(hadoopJarStepConfig);
}
 
Example 8
Source Project: herd   Source File: EmrDaoImpl.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public String addEmrStep(String clusterId, StepConfig emrStepConfig, AwsParamsDto awsParamsDto) throws Exception
{
    // Wrap the single step in a job flow steps request targeted at the given cluster.
    AddJobFlowStepsRequest jobFlowStepRequest = new AddJobFlowStepsRequest().withJobFlowId(clusterId).withSteps(emrStepConfig);

    // Submit the request and return the id of the (only) added step.
    List<String> emrStepIds = emrOperations.addJobFlowStepsRequest(getEmrClient(awsParamsDto), jobFlowStepRequest);
    return emrStepIds.get(0);
}
 
Example 9
Source Project: herd   Source File: EmrDaoImpl.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Create the step config list of objects for hive/pig installation.
 *
 * @param emrClusterDefinition the EMR definition name value.
 *
 * @return list of step configuration that contains all the steps for the given configuration.
 */
private List<StepConfig> getStepConfig(EmrClusterDefinition emrClusterDefinition)
{
    StepFactory stepFactory = new StepFactory();
    List<StepConfig> appSteps = new ArrayList<>();

    // Add an install-hive step when a hive version is configured.
    String hiveVersion = emrClusterDefinition.getHiveVersion();
    if (StringUtils.isNotBlank(hiveVersion))
    {
        appSteps.add(new StepConfig().withName("Hive " + hiveVersion).withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW)
            .withHadoopJarStep(stepFactory.newInstallHiveStep(hiveVersion)));
    }

    // Add an install-pig step when a pig version is configured.
    String pigVersion = emrClusterDefinition.getPigVersion();
    if (StringUtils.isNotBlank(pigVersion))
    {
        appSteps.add(new StepConfig().withName("Pig " + pigVersion).withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW)
            .withHadoopJarStep(stepFactory.newInstallPigStep(pigVersion)));
    }

    // Append any custom hadoop jar steps declared in the definition.
    if (!CollectionUtils.isEmpty(emrClusterDefinition.getHadoopJarSteps()))
    {
        for (HadoopJarStep hadoopJarStep : emrClusterDefinition.getHadoopJarSteps())
        {
            appSteps.add(emrHelper.getEmrHadoopJarStepConfig(hadoopJarStep.getStepName(), hadoopJarStep.getJarLocation(),
                hadoopJarStep.getMainClass(), hadoopJarStep.getScriptArguments(), hadoopJarStep.isContinueOnError()));
        }
    }

    return appSteps;
}
 
Example 10
Source Project: herd   Source File: EmrHelperTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies the basic hadoop jar step config: name and jar are set as given when no main
 * class, no arguments, and continue-on-error disabled.
 */
@Test
public void testEmrHadoopJarStepConfig()
{
    StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, null, false);

    // Fixed typo in the assertion message: "retuned" -> "returned" (consistent with testEmrHadoopJarStepConfigWithArguments).
    assertNotNull("step not returned", stepConfig);

    assertEquals("name not found", "step_name", stepConfig.getName());
    assertEquals("jar not found", "jar_location", stepConfig.getHadoopJarStep().getJar());
}
 
Example 11
Source Project: herd   Source File: EmrHelperTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies the hadoop jar step config when continue-on-error is null (unspecified), which
 * should fall back to the default failure action.
 */
@Test
public void testEmrHadoopJarStepConfigNoContinueOnError()
{
    StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, null, null);

    // Fixed typo in the assertion message: "retuned" -> "returned" (consistent with testEmrHadoopJarStepConfigWithArguments).
    assertNotNull("step not returned", stepConfig);

    assertEquals("name not found", "step_name", stepConfig.getName());
    assertEquals("jar not found", "jar_location", stepConfig.getHadoopJarStep().getJar());
}
 
Example 12
Source Project: herd   Source File: EmrHelperTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies the hadoop jar step config when continue-on-error is explicitly enabled.
 */
@Test
public void testEmrHadoopJarStepConfigContinueOnError()
{
    StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, null, true);

    // Fixed typo in the assertion message: "retuned" -> "returned" (consistent with testEmrHadoopJarStepConfigWithArguments).
    assertNotNull("step not returned", stepConfig);

    assertEquals("name not found", "step_name", stepConfig.getName());
    assertEquals("jar not found", "jar_location", stepConfig.getHadoopJarStep().getJar());
}
 
Example 13
Source Project: herd   Source File: EmrHelperTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that supplied script arguments are carried through to the hadoop jar step.
 */
@Test
public void testEmrHadoopJarStepConfigWithArguments()
{
    // Build a step config that carries a single script argument.
    List<String> arguments = new ArrayList<>();
    arguments.add("arg1");
    StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, arguments, false);

    assertNotNull("step not returned", stepConfig);
    assertEquals("name not found", "step_name", stepConfig.getName());
    assertEquals("jar not found", "jar_location", stepConfig.getHadoopJarStep().getJar());
    assertNotNull("arguments not found", stepConfig.getHadoopJarStep().getArgs());
}
 
Example 14
Source Project: herd   Source File: MockEmrOperationsImpl.java    License: Apache License 2.0 5 votes vote down vote up
private MockEmrJobFlow addClusterStep(String jobFlowId, StepConfig step)
{
    // Lazily create the step list for this cluster.
    List<MockEmrJobFlow> clusterSteps = getStepsByClusterId(jobFlowId);
    if (clusterSteps == null)
    {
        clusterSteps = new ArrayList<>();
    }

    String stepName = step.getName();

    MockEmrJobFlow mockStep = new MockEmrJobFlow();
    // The special "running without id" step deliberately gets no job flow id assigned.
    if (!stepName.equalsIgnoreCase(MOCK_STEP_RUNNING_WITHOUT_ID_NAME))
    {
        mockStep.setJobFlowId(getNewJobFlowId());
    }
    mockStep.setJobFlowName(stepName);
    // Steps with either "running" mock name start out RUNNING; everything else starts PENDING.
    boolean startRunning = stepName.equalsIgnoreCase(MOCK_STEP_RUNNING_NAME) || stepName.equalsIgnoreCase(MOCK_STEP_RUNNING_WITHOUT_ID_NAME);
    mockStep.setStatus(startRunning ? StepState.RUNNING.toString() : StepState.PENDING.toString());
    mockStep.setJarLocation(step.getHadoopJarStep().getJar());

    clusterSteps.add(mockStep);
    setStepsByClusterId(jobFlowId, clusterSteps);
    return mockStep;
}
 
Example 15
Source Project: digdag   Source File: EmrOperatorFactory.java    License: Apache License 2.0 5 votes vote down vote up
private void addStep(String name, CommandRunnerConfiguration configuration)
        throws IOException
{
    // Serialize the command runner configuration into an in-memory "config.json".
    FileReference configFileRef = ImmutableFileReference.builder()
            .type(FileReference.Type.DIRECT)
            .contents(objectMapper.writeValueAsBytes(configuration))
            .filename("config.json")
            .build();

    // Stage the configuration file remotely and register a script runner step that consumes it.
    RemoteFile stagedConfigFile = prepareRemoteFile(configFileRef, false);
    configs.add(stepConfig(name, tag, step)
            .withHadoopJarStep(stepFactory().newScriptRunnerStep(
                    runner.s3Uri().toString(), stagedConfigFile.s3Uri().toString())));
}
 
Example 16
Source Project: digdag   Source File: EmrOperatorFactory.java    License: Apache License 2.0 5 votes vote down vote up
private StepConfig stepConfig(String defaultName, String tag, Config step)
{
    // Steps may override the default name via their "name" setting; the tag is always appended.
    String stepName = step.get("name", String.class, defaultName);
    // Valid values: TERMINATE_JOB_FLOW | TERMINATE_CLUSTER | CANCEL_AND_WAIT | CONTINUE
    String actionOnFailure = step.get("action_on_failure", String.class, defaultActionOnFailure);
    return new StepConfig()
            .withName(stepName + " (" + tag + ")")
            .withActionOnFailure(actionOnFailure);
}
 
Example 17
Source Project: datacollector   Source File: EmrClusterJob.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public String createCluster(String clusterName) {
  // Base cluster request: roles, visibility, and instance topology from the cluster config.
  RunJobFlowRequest request = new RunJobFlowRequest()
      .withName(clusterName)
      .withReleaseLabel(EmrInfo.getVersion())
      .withServiceRole(emrClusterConfig.getServiceRole())
      .withJobFlowRole(emrClusterConfig.getJobFlowRole())
      .withVisibleToAllUsers(emrClusterConfig.isVisibleToAllUsers())
      .withInstances(new JobFlowInstancesConfig()
          .withEc2SubnetId(emrClusterConfig.getEc2SubnetId())
          .withEmrManagedMasterSecurityGroup(emrClusterConfig.getMasterSecurityGroup())
          .withEmrManagedSlaveSecurityGroup(emrClusterConfig.getSlaveSecurityGroup())
          .withInstanceCount(emrClusterConfig.getInstanceCount())
          .withKeepJobFlowAliveWhenNoSteps(true)
          .withMasterInstanceType(emrClusterConfig.getMasterInstanceType())
          .withSlaveInstanceType(emrClusterConfig.getSlaveInstanceType()));

  if (emrClusterConfig.isLoggingEnabled()) {
    request.withLogUri(emrClusterConfig.getS3LogUri());
    if (emrClusterConfig.isEnableEmrDebugging()) {
      // Debugging is enabled by running the state-pusher script through command-runner.
      String commandRunner = "command-runner.jar";
      String debuggingCommand = "state-pusher-script";
      String debuggingName = "Setup Hadoop Debugging";
      StepConfig enableDebugging = new StepConfig()
          .withName(debuggingName)
          .withActionOnFailure(ActionOnFailure.CONTINUE)
          .withHadoopJarStep(new HadoopJarStepConfig()
              .withJar(commandRunner)
              .withArgs(debuggingCommand));
      request.withSteps(enableDebugging);
    }
  }
  return getEmrClient(emrClusterConfig).runJobFlow(request).getJobFlowId();
}
 
Example 18
Source Project: aws-big-data-blog   Source File: EMRUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * This is a helper method for creating step configuration information.
 * @param stepName - a custom name to label this step
 * @param actionOnFailure - options are terminate cluster, terminate job flow, continue
 * @param jarPath - path to jar file - could be on S3 or local file system
 * @param args list of Java args to configure custom step
 * @return the assembled step configuration
 */
private static StepConfig createStepConfig(String stepName, String actionOnFailure, String jarPath, List<String> args) {
	return new StepConfig()
			.withName(stepName)
			.withActionOnFailure(actionOnFailure)
			.withHadoopJarStep(new HadoopJarStepConfig()
					.withJar(jarPath)
					.withArgs(args));
}
 
Example 19
Source Project: aws-big-data-blog   Source File: LambdaContainer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Submits a Spark step (run via command-runner.jar) to an existing EMR cluster and
 * returns the id of the added step.
 *
 * @param paramsStr comma-separated arguments passed to command-runner
 * @param clusterId the id of the target EMR cluster
 * @return the id of the submitted step
 */
protected String fireEMRJob(String paramsStr, String clusterId){
	// NOTE(review): the original built an "Enable debugging" StepConfig (via StepFactory)
	// and an Application named "Spark" here but never attached either to the request —
	// both were dead code and have been removed. If debugging should actually be enabled,
	// add new StepFactory().newEnableDebuggingStep() as an explicit extra step.
	AmazonElasticMapReduceClient emr = new AmazonElasticMapReduceClient();
	emr.setRegion(Region.getRegion(Regions.fromName(System.getenv().get("AWS_REGION"))));

	// Run the Spark job through command-runner with the comma-separated parameters.
	HadoopJarStepConfig sparkStepConf = new HadoopJarStepConfig()
			.withJar("command-runner.jar")
			.withArgs(paramsStr.split(","));

	StepConfig sparkStep = new StepConfig()
			.withName("Spark Step")
			.withActionOnFailure("CONTINUE")
			.withHadoopJarStep(sparkStepConf);

	// Replaced double-brace initialization (which creates a leaky anonymous inner class)
	// with the SDK's withSteps varargs.
	AddJobFlowStepsRequest request = new AddJobFlowStepsRequest(clusterId)
			.withSteps(sparkStep);

	AddJobFlowStepsResult result = emr.addJobFlowSteps(request);
	return result.getStepIds().get(0);
}
 
Example 20
Source Project: herd   Source File: EmrDaoTest.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Verifies that creating an EMR cluster from a definition that uses an instance fleet and a
 * comma-separated list of EC2 subnet ids produces a RunJobFlowRequest with the expected
 * instance fleet (and no instance groups), both subnet ids, the default IAM roles, no steps,
 * and the definition's node tags.
 */
@Test
public void createEmrClusterAssertCallRunEmrJobFlowWithInstanceFleetAndMultipleSubnets()
{
    // Create objects required for testing.
    final String clusterName = "clusterName";
    final String clusterId = "clusterId";
    final String name = STRING_VALUE;
    final String instanceFleetType = STRING_VALUE_2;
    final Integer targetOnDemandCapacity = INTEGER_VALUE;
    final Integer targetSpotCapacity = INTEGER_VALUE_2;
    final List<EmrClusterDefinitionInstanceTypeConfig> emrClusterDefinitionInstanceTypeConfigs = null;
    final EmrClusterDefinitionLaunchSpecifications emrClusterDefinitionLaunchSpecifications = null;
    final EmrClusterDefinitionInstanceFleet emrClusterDefinitionInstanceFleet =
        new EmrClusterDefinitionInstanceFleet(name, instanceFleetType, targetOnDemandCapacity, targetSpotCapacity, emrClusterDefinitionInstanceTypeConfigs,
            emrClusterDefinitionLaunchSpecifications);

    // Create an EMR cluster definition with instance fleet configuration and multiple EC2 subnet IDs.
    EmrClusterDefinition emrClusterDefinition = new EmrClusterDefinition();
    emrClusterDefinition.setInstanceFleets(Lists.newArrayList(emrClusterDefinitionInstanceFleet));
    // The subnet value deliberately contains extra whitespace; the assertions below show the
    // ids end up trimmed in the request's Ec2SubnetIds list.
    emrClusterDefinition.setSubnetId(String.format("%s , %s  ", EC2_SUBNET, EC2_SUBNET_2));
    emrClusterDefinition.setNodeTags(Lists.newArrayList(new NodeTag("tagName", "tagValue")));

    // Stub the EMR operation so we can inspect the RunJobFlowRequest the DAO builds.
    when(mockEmrOperations.runEmrJobFlow(any(), any())).then(new Answer<String>()
    {
        @Override
        public String answer(InvocationOnMock invocation)
        {
            // Assert that the given EMR cluster definition produced the correct RunJobFlowRequest.
            RunJobFlowRequest runJobFlowRequest = invocation.getArgument(1);
            JobFlowInstancesConfig jobFlowInstancesConfig = runJobFlowRequest.getInstances();
            // Instance fleets and instance groups are mutually exclusive; expect no groups here.
            assertEquals(0, CollectionUtils.size(jobFlowInstancesConfig.getInstanceGroups()));
            final List<InstanceTypeConfig> expectedInstanceTypeConfigs = null;
            assertEquals(Lists.newArrayList(
                new InstanceFleetConfig().withName(name).withInstanceFleetType(instanceFleetType).withTargetOnDemandCapacity(targetOnDemandCapacity)
                    .withTargetSpotCapacity(targetSpotCapacity).withInstanceTypeConfigs(expectedInstanceTypeConfigs).withLaunchSpecifications(null)),
                jobFlowInstancesConfig.getInstanceFleets());
            // With multiple subnets, the single-subnet field must be unset and both ids listed.
            assertNull(jobFlowInstancesConfig.getEc2SubnetId());
            assertEquals(2, CollectionUtils.size(jobFlowInstancesConfig.getEc2SubnetIds()));
            assertTrue(jobFlowInstancesConfig.getEc2SubnetIds().contains(EC2_SUBNET));
            assertTrue(jobFlowInstancesConfig.getEc2SubnetIds().contains(EC2_SUBNET_2));
            // Roles fall back to the configured EMR defaults when not set on the definition.
            assertEquals(herdStringHelper.getRequiredConfigurationValue(ConfigurationValue.EMR_DEFAULT_EC2_NODE_IAM_PROFILE_NAME),
                runJobFlowRequest.getJobFlowRole());
            assertEquals(herdStringHelper.getRequiredConfigurationValue(ConfigurationValue.EMR_DEFAULT_SERVICE_IAM_ROLE_NAME),
                runJobFlowRequest.getServiceRole());
            List<StepConfig> stepConfigs = runJobFlowRequest.getSteps();
            assertEquals(0, stepConfigs.size());
            List<Tag> tags = runJobFlowRequest.getTags();
            assertEquals(1, tags.size());
            {
                Tag tag = tags.get(0);
                assertEquals("tagName", tag.getKey());
                assertEquals("tagValue", tag.getValue());
            }

            return clusterId;
        }
    });

    assertEquals(clusterId, emrDao.createEmrCluster(clusterName, emrClusterDefinition, getAwsParamsDto()));
}
 
Example 21
Source Project: digdag   Source File: EmrOperatorFactory.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Compiles the configured steps, stages files to S3, and submits a RunJobFlowRequest for a
 * brand-new EMR cluster. Returns the new cluster's job flow id and step count.
 *
 * @param emr the EMR client used to submit the job flow
 * @param tag unique tag identifying this digdag run; appended to the cluster name and stored
 *            as the DIGDAG_CLUSTER_ID cluster tag
 * @param stepCompiler compiles the operator's step definitions into EMR StepConfigs
 * @param cluster the "cluster" configuration block
 * @param filer stages local/generated files to S3
 * @param parameterCompiler compiles parameters for bootstrap actions
 * @return the submitted cluster's id and number of steps
 * @throws IOException if staging files or compiling steps fails
 */
private NewCluster submitNewClusterRequest(AmazonElasticMapReduce emr, String tag, StepCompiler stepCompiler,
        Config cluster, Filer filer, ParameterCompiler parameterCompiler)
        throws IOException
{
    RemoteFile runner = prepareRunner(filer, tag);

    // Compile steps
    stepCompiler.compile(runner);

    List<StepConfig> stepConfigs = stepCompiler.stepConfigs();

    Config ec2 = cluster.getNested("ec2");
    Config master = ec2.getNestedOrGetEmpty("master");
    // "core" is a single optional config; "task" is a list of configs.
    List<Config> core = ec2.getOptional("core", Config.class).transform(ImmutableList::of).or(ImmutableList.of());
    List<Config> task = ec2.getListOrEmpty("task", Config.class);

    // Default application set when none is configured.
    List<String> applications = cluster.getListOrEmpty("applications", String.class);
    if (applications.isEmpty()) {
        applications = ImmutableList.of("Hadoop", "Hive", "Spark", "Flink");
    }

    // TODO: allow configuring additional application parameters
    List<Application> applicationConfigs = applications.stream()
            .map(application -> new Application().withName(application))
            .collect(toList());

    // TODO: merge configurations with the same classification?
    List<Configuration> configurations = cluster.getListOrEmpty("configurations", JsonNode.class).stream()
            .map(this::configurations)
            .flatMap(Collection::stream)
            .collect(toList());

    // Bootstrap actions are numbered starting from 1.
    List<JsonNode> bootstrap = cluster.getListOrEmpty("bootstrap", JsonNode.class);
    List<BootstrapActionConfig> bootstrapActions = new ArrayList<>();
    for (int i = 0; i < bootstrap.size(); i++) {
        bootstrapActions.add(bootstrapAction(i + 1, bootstrap.get(i), tag, filer, runner, parameterCompiler));
    }

    // Stage files to S3
    filer.stageFiles();

    Optional<String> subnetId = ec2.getOptional("subnet_id", String.class);

    String defaultMasterInstanceType;
    String defaultCoreInstanceType;
    String defaultTaskInstanceType;

    if (subnetId.isPresent()) {
        // m4 requires VPC (subnet id)
        defaultMasterInstanceType = "m4.2xlarge";
        defaultCoreInstanceType = "m4.xlarge";
        defaultTaskInstanceType = "m4.xlarge";
    }
    else {
        defaultMasterInstanceType = "m3.2xlarge";
        defaultCoreInstanceType = "m3.xlarge";
        defaultTaskInstanceType = "m3.xlarge";
    }

    // Assemble the full job flow request from the compiled pieces and cluster settings.
    RunJobFlowRequest request = new RunJobFlowRequest()
            .withName(cluster.get("name", String.class, "Digdag") + " (" + tag + ")")
            .withReleaseLabel(cluster.get("release", String.class, "emr-5.2.0"))
            .withSteps(stepConfigs)
            .withBootstrapActions(bootstrapActions)
            .withApplications(applicationConfigs)
            .withLogUri(cluster.get("logs", String.class, null))
            .withJobFlowRole(cluster.get("cluster_role", String.class, "EMR_EC2_DefaultRole"))
            .withServiceRole(cluster.get("service_role", String.class, "EMR_DefaultRole"))
            .withTags(new Tag().withKey("DIGDAG_CLUSTER_ID").withValue(tag))
            .withVisibleToAllUsers(cluster.get("visible", boolean.class, true))
            .withConfigurations(configurations)
            .withInstances(new JobFlowInstancesConfig()
                    .withInstanceGroups(ImmutableList.<InstanceGroupConfig>builder()
                            // Master Node
                            .add(instanceGroupConfig("Master", master, "MASTER", defaultMasterInstanceType, 1))
                            // Core Group
                            .addAll(instanceGroupConfigs("Core", core, "CORE", defaultCoreInstanceType))
                            // Task Groups
                            .addAll(instanceGroupConfigs("Task %d", task, "TASK", defaultTaskInstanceType))
                            .build()
                    )
                    .withAdditionalMasterSecurityGroups(ec2.getListOrEmpty("additional_master_security_groups", String.class))
                    .withAdditionalSlaveSecurityGroups(ec2.getListOrEmpty("additional_slave_security_groups", String.class))
                    .withEmrManagedMasterSecurityGroup(ec2.get("emr_managed_master_security_group", String.class, null))
                    .withEmrManagedSlaveSecurityGroup(ec2.get("emr_managed_slave_security_group", String.class, null))
                    .withServiceAccessSecurityGroup(ec2.get("service_access_security_group", String.class, null))
                    .withTerminationProtected(cluster.get("termination_protected", boolean.class, false))
                    .withPlacement(cluster.getOptional("availability_zone", String.class)
                            .transform(zone -> new PlacementType().withAvailabilityZone(zone)).orNull())
                    .withEc2SubnetId(subnetId.orNull())
                    .withEc2KeyName(ec2.get("key", String.class))
                    // auto_terminate defaults to true; keep-alive is its inverse.
                    .withKeepJobFlowAliveWhenNoSteps(!cluster.get("auto_terminate", boolean.class, true)));

    logger.info("Submitting EMR job with {} steps(s)", request.getSteps().size());
    RunJobFlowResult result = emr.runJobFlow(request);
    logger.info("Submitted EMR job with {} step(s): {}", request.getSteps().size(), result.getJobFlowId(), result);

    return NewCluster.of(result.getJobFlowId(), request.getSteps().size());
}
 
Example 22
Source Project: digdag   Source File: EmrOperatorFactory.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the step configs collected so far.
 *
 * @return the compiled step configs; calling this before they have been populated is a
 *         programming error and fails the state check
 */
List<StepConfig> stepConfigs()
{
    Preconditions.checkState(configs != null);
    return configs;
}
 
Example 23
Source Project: aws-big-data-blog   Source File: EMRUtils.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Uses the AWS Java SDK to launch an Apache HBase cluster on Amazon EMR, or returns the
 * identifier of an existing cluster when one is already available. Blocks, polling every
 * 10 seconds, until the new cluster reaches the "waiting" state.
 *
 * @param client - AmazonElasticMapReduce client that interfaces directly with the Amazon EMR Web Service
 * @param clusterIdentifier - identifier of an existing cluster
 * @param amiVersion - AMI to use for launching this cluster
 * @param keypair - a keypair for SSHing into the Amazon EMR master node
 * @param masterInstanceType - master node Amazon EC2 instance type
 * @param coreInstanceType - core nodes Amazon EC2 instance type
 * @param logUri - an Amazon S3 bucket for your logs
 * @param numberOfNodes - total number of nodes in this cluster including master node
 * @return the job flow id of the launched (or pre-existing) cluster
 */
public static String createCluster(AmazonElasticMapReduce client,
		String clusterIdentifier,
		String amiVersion,
		String keypair,
		String masterInstanceType,
		String coreInstanceType,
		String logUri,
		int numberOfNodes) {

	if (clusterExists(client, clusterIdentifier)) {
		LOG.info("Cluster " + clusterIdentifier + " is available");
		return clusterIdentifier;
	}

	//Error checking
	if (amiVersion == null || amiVersion.isEmpty()) throw new RuntimeException("ERROR: Please specify an AMI Version");
	if (keypair == null || keypair.isEmpty()) throw new RuntimeException("ERROR: Please specify a valid Amazon Key Pair");
	if (masterInstanceType == null || masterInstanceType.isEmpty()) throw new RuntimeException("ERROR: Please specify a Master Instance Type");
	if (logUri == null || logUri.isEmpty()) throw new RuntimeException("ERROR: Please specify a valid Amazon S3 bucket for your logs.");
	if (numberOfNodes < 0) throw new RuntimeException("ERROR: Please specify at least 1 node");

	RunJobFlowRequest request = new RunJobFlowRequest()
		.withAmiVersion(amiVersion)
		.withBootstrapActions(new BootstrapActionConfig()
		             .withName("Install HBase")
		             .withScriptBootstrapAction(new ScriptBootstrapActionConfig()
		             .withPath("s3://elasticmapreduce/bootstrap-actions/setup-hbase")))
		.withName("Job Flow With HBAse Actions")
		.withSteps(new StepConfig() //enable debugging step
					.withName("Enable debugging")
					.withActionOnFailure("TERMINATE_CLUSTER")
					.withHadoopJarStep(new StepFactory().newEnableDebuggingStep()),
					//Start HBase step - after installing it with a bootstrap action
					createStepConfig("Start HBase","TERMINATE_CLUSTER", "/home/hadoop/lib/hbase.jar", getHBaseArgs()),
					//add HBase backup step
					createStepConfig("Modify backup schedule","TERMINATE_JOB_FLOW", "/home/hadoop/lib/hbase.jar", getHBaseBackupArgs()))
		.withLogUri(logUri)
		.withInstances(new JobFlowInstancesConfig()
		.withEc2KeyName(keypair)
		.withInstanceCount(numberOfNodes)
		.withKeepJobFlowAliveWhenNoSteps(true)
		.withMasterInstanceType(masterInstanceType)
		.withSlaveInstanceType(coreInstanceType));

	RunJobFlowResult result = client.runJobFlow(request);

	// Poll until the cluster leaves the startup states and becomes available.
	String state = null;
	while (!(state = clusterState(client, result.getJobFlowId())).equalsIgnoreCase("waiting")) {
		try {
			Thread.sleep(10 * 1000);
			LOG.info(result.getJobFlowId() + " is " + state + ". Waiting for cluster to become available.");
		} catch (InterruptedException e) {
			// Fixed: restore the interrupt status instead of silently swallowing the
			// exception, so callers can observe that this thread was interrupted.
			Thread.currentThread().interrupt();
		}

		if (state.equalsIgnoreCase("TERMINATED_WITH_ERRORS")){
			LOG.error("Could not create EMR Cluster");
			// NOTE(review): System.exit in a library utility is drastic; consider throwing
			// a RuntimeException instead so callers can handle the failure.
			System.exit(-1);
		}
	}
	LOG.info("Created cluster " + result.getJobFlowId());
	LOG.info("Cluster " + clusterIdentifier + " is available");
	return result.getJobFlowId();
}
 
Example 24
Source Project: herd   Source File: EmrStepHelper.java    License: Apache License 2.0 2 votes vote down vote up
/**
 * This method gets the StepConfig object for the given Step.
 *
 * @param step the step object; each concrete helper casts this to its own step type
 *             (e.g. shell, hive, pig, or hadoop jar step)
 *
 * @return the step config object describing how EMR should run the step
 */
public abstract StepConfig getEmrStepConfig(Object step);
 
Example 25
Source Project: herd   Source File: EmrDao.java    License: Apache License 2.0 2 votes vote down vote up
/**
 * Add an EMR Step. This method adds the step to EMR cluster based on the input.
 *
 * @param clusterId EMR cluster ID.
 * @param emrStepConfig the EMR step config to be added.
 * @param awsParamsDto the proxy details.
 * <p/>
 * There are four serializable objects supported currently. They are 1: ShellStep - For shell scripts 2: HiveStep - For hive scripts 3: HadoopJarStep - For
 * Custom Map Reduce Jar files and 4: PigStep - For Pig scripts.
 *
 * @return the step id
 * @throws Exception if the step could not be added to the cluster
 */
public String addEmrStep(String clusterId, StepConfig emrStepConfig, AwsParamsDto awsParamsDto) throws Exception;