com.amazonaws.services.elasticmapreduce.model.StepConfig Java Examples
The following examples show how to use
com.amazonaws.services.elasticmapreduce.model.StepConfig.
You can vote up the ones you like or vote down the ones you don't like,
You can also go to the original project or source file by following the links above each example, or check out the related API usage on the sidebar.
Example #1
Source File: EmrClusterJob.java From datacollector with Apache License 2.0 | 6 votes |
@Override public Properties submitJob(Properties jobProps) throws IOException { EMRJobConfig emrJobConfig = new EMRJobConfig(jobProps); Utils.checkNotNull(emrJobConfig.getClusterId(), "EMR Cluster Id"); StepConfig stepConfig = new StepConfig() .withName(emrJobConfig.getJobName()) .withActionOnFailure(ActionOnFailure.CONTINUE) // check if action on failure needs to be configurable .withHadoopJarStep(new HadoopJarStepConfig() .withJar(emrJobConfig.getDriverJarPath()) .withMainClass(emrJobConfig.getDriverMainClass()).withArgs( emrJobConfig.getArchives(), emrJobConfig.getLibjars(), emrJobConfig.getUniquePrefix(), emrJobConfig.getJavaOpts(), emrJobConfig.getLogLevel() )); LOG.debug("Step config is {}", stepConfig.toString()); AddJobFlowStepsResult addJobFlowStepsResult = getEmrClient(emrClusterConfig).addJobFlowSteps( new AddJobFlowStepsRequest() .withJobFlowId(emrJobConfig.getClusterId()) .withSteps(stepConfig)); String stepId = addJobFlowStepsResult.getStepIds().get(0); jobProps.setProperty("stepId", stepId); return jobProps; }
Example #2
Source File: MockEmrOperationsImpl.java From herd with Apache License 2.0 | 6 votes |
private MockEmrJobFlow createNewCluster(RunJobFlowRequest jobFlowRequest, String status, StatusChangeReason reason, StatusTimeline timeline) { MockEmrJobFlow cluster = new MockEmrJobFlow(); cluster.setJobFlowId(getNewJobFlowId()); cluster.setJobFlowName(jobFlowRequest.getName()); cluster.setStatus(status); cluster.setStatusTimeline(timeline); cluster.setStatusChangeReason(reason); emrClusters.put(cluster.getJobFlowId(), cluster); // Add the steps for (StepConfig stepConfig : jobFlowRequest.getSteps()) { addClusterStep(cluster.getJobFlowId(), stepConfig); } return cluster; }
Example #3
Source File: EmrHelperTest.java From herd with Apache License 2.0 | 5 votes |
/** Verifies that a hadoop jar step config built with script arguments carries them through. */
@Test
public void testEmrHadoopJarStepConfigWithArguments() {
    // Build a step config carrying a single script argument.
    List<String> scriptArguments = new ArrayList<>();
    scriptArguments.add("arg1");

    StepConfig hadoopJarStepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, scriptArguments, false);

    assertNotNull("step not returned", hadoopJarStepConfig);
    assertEquals("name not found", "step_name", hadoopJarStepConfig.getName());
    assertEquals("jar not found", "jar_location", hadoopJarStepConfig.getHadoopJarStep().getJar());
    assertNotNull("arguments not found", hadoopJarStepConfig.getHadoopJarStep().getArgs());
}
Example #4
Source File: LambdaContainer.java From aws-big-data-blog with Apache License 2.0 | 5 votes |
/**
 * Adds a Spark step (run via command-runner.jar) to an existing EMR cluster.
 *
 * @param paramsStr comma-separated arguments passed verbatim to command-runner.jar
 * @param clusterId id of the target EMR cluster
 * @return the id of the submitted step
 */
protected String fireEMRJob(String paramsStr, String clusterId) {
    AmazonElasticMapReduceClient emr = new AmazonElasticMapReduceClient();
    // Region is taken from the Lambda runtime environment.
    emr.setRegion(Region.getRegion(Regions.fromName(System.getenv().get("AWS_REGION"))));

    // NOTE(review): the original also built an "Enable debugging" StepConfig (via
    // StepFactory) and a Spark Application object here, but never attached either to
    // the request; that dead code is removed. If the debugging step was intended,
    // it must be added to the request explicitly.
    String[] params = paramsStr.split(",");
    HadoopJarStepConfig sparkStepConf = new HadoopJarStepConfig()
        .withJar("command-runner.jar")
        .withArgs(params);
    final StepConfig sparkStep = new StepConfig()
        .withName("Spark Step")
        .withActionOnFailure("CONTINUE")
        .withHadoopJarStep(sparkStepConf);

    // withSteps(StepConfig...) replaces the double-brace ArrayList anti-pattern
    // (which creates an anonymous class holding a reference to the enclosing instance).
    AddJobFlowStepsRequest request = new AddJobFlowStepsRequest(clusterId).withSteps(sparkStep);
    AddJobFlowStepsResult result = emr.addJobFlowSteps(request);
    return result.getStepIds().get(0);
}
Example #5
Source File: EMRUtils.java From aws-big-data-blog with Apache License 2.0 | 5 votes |
/** * This is a helper method for creating step configuration information * @param stepName - a custom name to label this step * @param actionOnFailure - options are terminate cluster, terminate job flow, contiunue * @param jarPath - path to jar file - could be on S3 or local file system * @param args list of Java args to configure custom step * @return */ private static StepConfig createStepConfig(String stepName, String actionOnFailure, String jarPath, List<String> args ) { //Start HBase step - after installing it with a bootstrap action StepConfig stepConfig = new StepConfig() .withName(stepName) .withActionOnFailure(actionOnFailure) .withHadoopJarStep(new HadoopJarStepConfig() .withJar(jarPath) .withArgs(args)); return stepConfig; }
Example #6
Source File: EmrClusterJob.java From datacollector with Apache License 2.0 | 5 votes |
/**
 * Launches a new EMR cluster configured from {@code emrClusterConfig}.
 *
 * @param clusterName name for the new cluster
 * @return the job flow id of the created cluster
 */
@Override
public String createCluster(String clusterName) {
    // Describe the EC2 instances backing the cluster; it stays alive with no steps queued.
    JobFlowInstancesConfig instancesConfig = new JobFlowInstancesConfig()
        .withEc2SubnetId(emrClusterConfig.getEc2SubnetId())
        .withEmrManagedMasterSecurityGroup(emrClusterConfig.getMasterSecurityGroup())
        .withEmrManagedSlaveSecurityGroup(emrClusterConfig.getSlaveSecurityGroup())
        .withInstanceCount(emrClusterConfig.getInstanceCount())
        .withKeepJobFlowAliveWhenNoSteps(true)
        .withMasterInstanceType(emrClusterConfig.getMasterInstanceType())
        .withSlaveInstanceType(emrClusterConfig.getSlaveInstanceType());

    RunJobFlowRequest request = new RunJobFlowRequest()
        .withName(clusterName)
        .withReleaseLabel(EmrInfo.getVersion())
        .withServiceRole(emrClusterConfig.getServiceRole())
        .withJobFlowRole(emrClusterConfig.getJobFlowRole())
        .withVisibleToAllUsers(emrClusterConfig.isVisibleToAllUsers())
        .withInstances(instancesConfig);

    if (emrClusterConfig.isLoggingEnabled()) {
        request.withLogUri(emrClusterConfig.getS3LogUri());
        if (emrClusterConfig.isEnableEmrDebugging()) {
            // EMR debugging is enabled by running the state-pusher script through command-runner.
            StepConfig debuggingStep = new StepConfig()
                .withName("Setup Hadoop Debugging")
                .withActionOnFailure(ActionOnFailure.CONTINUE)
                .withHadoopJarStep(new HadoopJarStepConfig()
                    .withJar("command-runner.jar")
                    .withArgs("state-pusher-script"));
            request.withSteps(debuggingStep);
        }
    }

    RunJobFlowResult result = getEmrClient(emrClusterConfig).runJobFlow(request);
    return result.getJobFlowId();
}
Example #7
Source File: EmrOperatorFactory.java From digdag with Apache License 2.0 | 5 votes |
private StepConfig stepConfig(String defaultName, String tag, Config step) { String name = step.get("name", String.class, defaultName); return new StepConfig() .withName(name + " (" + tag + ")") // TERMINATE_JOB_FLOW | TERMINATE_CLUSTER | CANCEL_AND_WAIT | CONTINUE .withActionOnFailure(step.get("action_on_failure", String.class, defaultActionOnFailure)); }
Example #8
Source File: EmrOperatorFactory.java From digdag with Apache License 2.0 | 5 votes |
/**
 * Serializes the runner configuration, stages it remotely, and appends a
 * script-runner step that consumes it.
 *
 * @param name the step name
 * @param configuration the command-runner configuration to serialize
 * @throws IOException if serialization or staging fails
 */
private void addStep(String name, CommandRunnerConfiguration configuration) throws IOException {
    // Serialize the configuration to JSON and stage it as a remote file.
    byte[] configurationJson = objectMapper.writeValueAsBytes(configuration);
    FileReference configurationFileReference = ImmutableFileReference.builder()
        .type(FileReference.Type.DIRECT)
        .contents(configurationJson)
        .filename("config.json")
        .build();
    RemoteFile remoteConfigurationFile = prepareRemoteFile(configurationFileReference, false);

    // The step runs the script runner with the runner jar and its staged configuration.
    StepConfig runStep = stepConfig(name, tag, step)
        .withHadoopJarStep(stepFactory().newScriptRunnerStep(
            runner.s3Uri().toString(), remoteConfigurationFile.s3Uri().toString()));
    configs.add(runStep);
}
Example #9
Source File: MockEmrOperationsImpl.java From herd with Apache License 2.0 | 5 votes |
/**
 * Adds a mock step to the identified mock cluster.
 *
 * @param jobFlowId id of the cluster the step is added to
 * @param step the step config to mock
 * @return the newly created mock step
 */
private MockEmrJobFlow addClusterStep(String jobFlowId, StepConfig step) {
    List<MockEmrJobFlow> existingSteps = getStepsByClusterId(jobFlowId);
    if (existingSteps == null) {
        existingSteps = new ArrayList<>();
    }

    MockEmrJobFlow mockStep = new MockEmrJobFlow();
    String stepName = step.getName();
    // Steps mocked as "running without id" deliberately receive no job flow id.
    if (!stepName.equalsIgnoreCase(MOCK_STEP_RUNNING_WITHOUT_ID_NAME)) {
        mockStep.setJobFlowId(getNewJobFlowId());
    }
    mockStep.setJobFlowName(stepName);
    // The "running" mock names start in RUNNING; everything else starts PENDING.
    if (stepName.equalsIgnoreCase(MOCK_STEP_RUNNING_NAME) || stepName.equalsIgnoreCase(MOCK_STEP_RUNNING_WITHOUT_ID_NAME)) {
        mockStep.setStatus(StepState.RUNNING.toString());
    } else {
        mockStep.setStatus(StepState.PENDING.toString());
    }
    mockStep.setJarLocation(step.getHadoopJarStep().getJar());

    existingSteps.add(mockStep);
    setStepsByClusterId(jobFlowId, existingSteps);
    return mockStep;
}
Example #10
Source File: EmrPigStepHelper.java From herd with Apache License 2.0 | 5 votes |
@Override public StepConfig getEmrStepConfig(Object step) { EmrPigStep pigStep = (EmrPigStep) step; // Default ActionOnFailure is to cancel the execution and wait ActionOnFailure actionOnFailure = ActionOnFailure.CANCEL_AND_WAIT; if (pigStep.isContinueOnError() != null && pigStep.isContinueOnError()) { // Override based on user input actionOnFailure = ActionOnFailure.CONTINUE; } // If there are no arguments to hive script if (CollectionUtils.isEmpty(pigStep.getScriptArguments())) { // Just build the StepConfig object and return return new StepConfig().withName(pigStep.getStepName().trim()).withActionOnFailure(actionOnFailure) .withHadoopJarStep(new StepFactory().newRunPigScriptStep(pigStep.getScriptLocation().trim())); } // If there are arguments specified else { return new StepConfig().withName(pigStep.getStepName().trim()).withActionOnFailure(actionOnFailure).withHadoopJarStep(new StepFactory() .newRunPigScriptStep(pigStep.getScriptLocation().trim(), pigStep.getScriptArguments().toArray(new String[pigStep.getScriptArguments().size()]))); } }
Example #11
Source File: EmrHelperTest.java From herd with Apache License 2.0 | 5 votes |
/** Verifies that continueOnError=true maps to the CONTINUE action on failure. */
@Test
public void testEmrHadoopJarStepConfigContinueOnError() {
    StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, null, true);

    // "retuned" typo in the failure message fixed.
    assertNotNull("step not returned", stepConfig);
    assertEquals("name not found", "step_name", stepConfig.getName());
    assertEquals("jar not found", "jar_location", stepConfig.getHadoopJarStep().getJar());
    // The test is named for continueOnError but never asserted it; pin the behavior.
    assertEquals("action on failure not found", "CONTINUE", stepConfig.getActionOnFailure());
}
Example #12
Source File: EmrHelperTest.java From herd with Apache License 2.0 | 5 votes |
/** Verifies that continueOnError=null falls back to the CANCEL_AND_WAIT action. */
@Test
public void testEmrHadoopJarStepConfigNoContinueOnError() {
    StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, null, null);

    // "retuned" typo in the failure message fixed.
    assertNotNull("step not returned", stepConfig);
    assertEquals("name not found", "step_name", stepConfig.getName());
    assertEquals("jar not found", "jar_location", stepConfig.getHadoopJarStep().getJar());
    // The test is named for the no-continue case but never asserted it; pin the default.
    assertEquals("action on failure not found", "CANCEL_AND_WAIT", stepConfig.getActionOnFailure());
}
Example #13
Source File: EmrHelperTest.java From herd with Apache License 2.0 | 5 votes |
/** Verifies the basic hadoop jar step config (continueOnError=false). */
@Test
public void testEmrHadoopJarStepConfig() {
    StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, null, false);

    // "retuned" typo in the failure message fixed.
    assertNotNull("step not returned", stepConfig);
    assertEquals("name not found", "step_name", stepConfig.getName());
    assertEquals("jar not found", "jar_location", stepConfig.getHadoopJarStep().getJar());
    // continueOnError=false must keep the CANCEL_AND_WAIT default.
    assertEquals("action on failure not found", "CANCEL_AND_WAIT", stepConfig.getActionOnFailure());
}
Example #14
Source File: EmrDaoImpl.java From herd with Apache License 2.0 | 5 votes |
/** * Create the step config list of objects for hive/pig installation. * * @param emrClusterDefinition the EMR definition name value. * * @return list of step configuration that contains all the steps for the given configuration. */ private List<StepConfig> getStepConfig(EmrClusterDefinition emrClusterDefinition) { StepFactory stepFactory = new StepFactory(); List<StepConfig> appSteps = new ArrayList<>(); // Create install hive step and add to the StepConfig list if (StringUtils.isNotBlank(emrClusterDefinition.getHiveVersion())) { StepConfig installHive = new StepConfig().withName("Hive " + emrClusterDefinition.getHiveVersion()).withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW) .withHadoopJarStep(stepFactory.newInstallHiveStep(emrClusterDefinition.getHiveVersion())); appSteps.add(installHive); } // Create install Pig step and add to the StepConfig List if (StringUtils.isNotBlank(emrClusterDefinition.getPigVersion())) { StepConfig installPig = new StepConfig().withName("Pig " + emrClusterDefinition.getPigVersion()).withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW) .withHadoopJarStep(stepFactory.newInstallPigStep(emrClusterDefinition.getPigVersion())); appSteps.add(installPig); } // Add the hadoop jar steps that need to be added. if (!CollectionUtils.isEmpty(emrClusterDefinition.getHadoopJarSteps())) { for (HadoopJarStep hadoopJarStep : emrClusterDefinition.getHadoopJarSteps()) { StepConfig stepConfig = emrHelper .getEmrHadoopJarStepConfig(hadoopJarStep.getStepName(), hadoopJarStep.getJarLocation(), hadoopJarStep.getMainClass(), hadoopJarStep.getScriptArguments(), hadoopJarStep.isContinueOnError()); appSteps.add(stepConfig); } } return appSteps; }
Example #15
Source File: EmrDaoImpl.java From herd with Apache License 2.0 | 5 votes |
@Override public String addEmrStep(String clusterId, StepConfig emrStepConfig, AwsParamsDto awsParamsDto) throws Exception { List<StepConfig> steps = new ArrayList<>(); steps.add(emrStepConfig); // Add the job flow request AddJobFlowStepsRequest jobFlowStepRequest = new AddJobFlowStepsRequest(clusterId, steps); List<String> emrStepIds = emrOperations.addJobFlowStepsRequest(getEmrClient(awsParamsDto), jobFlowStepRequest); return emrStepIds.get(0); }
Example #16
Source File: EmrHelper.java From herd with Apache License 2.0 | 5 votes |
/** * Builds the StepConfig for the Hadoop jar step. * * @param stepName the step name. * @param jarLocation the location of jar. * @param mainClass the main class. * @param scriptArguments the arguments. * @param isContinueOnError indicate what to do on error. * * @return the stepConfig. */ public StepConfig getEmrHadoopJarStepConfig(String stepName, String jarLocation, String mainClass, List<String> scriptArguments, Boolean isContinueOnError) { // Default ActionOnFailure is to cancel the execution and wait ActionOnFailure actionOnFailure = ActionOnFailure.CANCEL_AND_WAIT; if (isContinueOnError != null && isContinueOnError) { // Override based on user input actionOnFailure = ActionOnFailure.CONTINUE; } // If there are no arguments if (CollectionUtils.isEmpty(scriptArguments)) { // Build the StepConfig object and return return new StepConfig().withName(stepName.trim()).withActionOnFailure(actionOnFailure) .withHadoopJarStep(new HadoopJarStepConfig().withJar(jarLocation.trim()).withMainClass(mainClass)); } else { // If there are arguments, include the arguments in the StepConfig object return new StepConfig().withName(stepName.trim()).withActionOnFailure(actionOnFailure).withHadoopJarStep( new HadoopJarStepConfig().withJar(jarLocation.trim()).withMainClass(mainClass) .withArgs(scriptArguments.toArray(new String[scriptArguments.size()]))); } }
Example #17
Source File: EmrShellStepHelper.java From herd with Apache License 2.0 | 5 votes |
@Override public StepConfig getEmrStepConfig(Object step) { EmrShellStep emrShellStep = (EmrShellStep) step; // Hadoop Jar provided by Amazon for running Shell Scripts String hadoopJarForShellScript = configurationHelper.getProperty(ConfigurationValue.EMR_SHELL_SCRIPT_JAR_PATH); // Default ActionOnFailure is to cancel the execution and wait ActionOnFailure actionOnFailure = ActionOnFailure.CANCEL_AND_WAIT; if (emrShellStep.isContinueOnError() != null && emrShellStep.isContinueOnError()) { // Override based on user input actionOnFailure = ActionOnFailure.CONTINUE; } // Add the script location List<String> argsList = new ArrayList<>(); argsList.add(emrShellStep.getScriptLocation().trim()); // Add the script arguments if (!CollectionUtils.isEmpty(emrShellStep.getScriptArguments())) { for (String argument : emrShellStep.getScriptArguments()) { argsList.add(argument.trim()); } } // Return the StepConfig object HadoopJarStepConfig jarConfig = new HadoopJarStepConfig(hadoopJarForShellScript).withArgs(argsList); return new StepConfig().withName(emrShellStep.getStepName().trim()).withActionOnFailure(actionOnFailure).withHadoopJarStep(jarConfig); }
Example #18
Source File: EmrHadoopJarStepHelper.java From herd with Apache License 2.0 | 5 votes |
/**
 * Builds the StepConfig for a custom hadoop jar step by delegating to the shared EMR helper.
 *
 * @param step the step object; must be an {@code EmrHadoopJarStep}
 * @return the step config for the hadoop jar step
 */
@Override
public StepConfig getEmrStepConfig(Object step) {
    EmrHadoopJarStep emrHadoopJarStep = (EmrHadoopJarStep) step;
    return emrHelper.getEmrHadoopJarStepConfig(
        emrHadoopJarStep.getStepName(),
        emrHadoopJarStep.getJarLocation(),
        emrHadoopJarStep.getMainClass(),
        emrHadoopJarStep.getScriptArguments(),
        emrHadoopJarStep.isContinueOnError());
}
Example #19
Source File: EmrHiveStepHelper.java From herd with Apache License 2.0 | 5 votes |
@Override public StepConfig getEmrStepConfig(Object step) { EmrHiveStep emrHiveStep = (EmrHiveStep) step; // Default ActionOnFailure is to cancel the execution and wait ActionOnFailure actionOnFailure = ActionOnFailure.CANCEL_AND_WAIT; if (emrHiveStep.isContinueOnError() != null && emrHiveStep.isContinueOnError()) { // Override based on user input actionOnFailure = ActionOnFailure.CONTINUE; } // If there are no arguments to hive script if (CollectionUtils.isEmpty(emrHiveStep.getScriptArguments())) { // Just build the StepConfig object and return return new StepConfig().withName(emrHiveStep.getStepName().trim()).withActionOnFailure(actionOnFailure) .withHadoopJarStep(new StepFactory().newRunHiveScriptStep(emrHiveStep.getScriptLocation().trim())); } // If there are arguments specified else { // For each argument, add "-d" option List<String> hiveArgs = new ArrayList<>(); for (String hiveArg : emrHiveStep.getScriptArguments()) { hiveArgs.add("-d"); hiveArgs.add(hiveArg); } // Return the StepConfig object return new StepConfig().withName(emrHiveStep.getStepName().trim()).withActionOnFailure(actionOnFailure).withHadoopJarStep( new StepFactory().newRunHiveScriptStep(emrHiveStep.getScriptLocation().trim(), hiveArgs.toArray(new String[hiveArgs.size()]))); } }
Example #20
Source File: EmrDaoTest.java From herd with Apache License 2.0 | 4 votes |
/**
 * Verifies that an EMR cluster definition carrying an instance fleet and multiple EC2
 * subnet ids produces the expected RunJobFlowRequest (instance fleets populated, no
 * instance groups, both subnets present, default roles applied).
 */
@Test
public void createEmrClusterAssertCallRunEmrJobFlowWithInstanceFleetAndMultipleSubnets() {
    // Create objects required for testing.
    final String clusterName = "clusterName";
    final String clusterId = "clusterId";
    final String name = STRING_VALUE;
    final String instanceFleetType = STRING_VALUE_2;
    final Integer targetOnDemandCapacity = INTEGER_VALUE;
    final Integer targetSpotCapacity = INTEGER_VALUE_2;
    final List<EmrClusterDefinitionInstanceTypeConfig> emrClusterDefinitionInstanceTypeConfigs = null;
    final EmrClusterDefinitionLaunchSpecifications emrClusterDefinitionLaunchSpecifications = null;
    final EmrClusterDefinitionInstanceFleet emrClusterDefinitionInstanceFleet =
        new EmrClusterDefinitionInstanceFleet(name, instanceFleetType, targetOnDemandCapacity, targetSpotCapacity,
            emrClusterDefinitionInstanceTypeConfigs, emrClusterDefinitionLaunchSpecifications);

    // Create an EMR cluster definition with instance fleet configuration and multiple EC2 subnet IDs.
    EmrClusterDefinition emrClusterDefinition = new EmrClusterDefinition();
    emrClusterDefinition.setInstanceFleets(Lists.newArrayList(emrClusterDefinitionInstanceFleet));
    // NOTE(review): subnet ids are deliberately padded with whitespace — presumably to
    // exercise trimming during parsing, since the assertions below expect the bare ids.
    emrClusterDefinition.setSubnetId(String.format("%s , %s ", EC2_SUBNET, EC2_SUBNET_2));
    emrClusterDefinition.setNodeTags(Lists.newArrayList(new NodeTag("tagName", "tagValue")));

    when(mockEmrOperations.runEmrJobFlow(any(), any())).then(new Answer<String>()
    {
        @Override
        public String answer(InvocationOnMock invocation)
        {
            // Assert that the given EMR cluster definition produced the correct RunJobFlowRequest.
            RunJobFlowRequest runJobFlowRequest = invocation.getArgument(1);
            JobFlowInstancesConfig jobFlowInstancesConfig = runJobFlowRequest.getInstances();
            // With an instance fleet configured, no instance groups should be present.
            assertEquals(0, CollectionUtils.size(jobFlowInstancesConfig.getInstanceGroups()));
            final List<InstanceTypeConfig> expectedInstanceTypeConfigs = null;
            assertEquals(Lists.newArrayList(
                new InstanceFleetConfig().withName(name).withInstanceFleetType(instanceFleetType).withTargetOnDemandCapacity(targetOnDemandCapacity)
                    .withTargetSpotCapacity(targetSpotCapacity).withInstanceTypeConfigs(expectedInstanceTypeConfigs).withLaunchSpecifications(null)),
                jobFlowInstancesConfig.getInstanceFleets());
            // With multiple subnets, the single-subnet field must be unset.
            assertNull(jobFlowInstancesConfig.getEc2SubnetId());
            assertEquals(2, CollectionUtils.size(jobFlowInstancesConfig.getEc2SubnetIds()));
            assertTrue(jobFlowInstancesConfig.getEc2SubnetIds().contains(EC2_SUBNET));
            assertTrue(jobFlowInstancesConfig.getEc2SubnetIds().contains(EC2_SUBNET_2));
            // Default IAM roles come from configuration.
            assertEquals(herdStringHelper.getRequiredConfigurationValue(ConfigurationValue.EMR_DEFAULT_EC2_NODE_IAM_PROFILE_NAME),
                runJobFlowRequest.getJobFlowRole());
            assertEquals(herdStringHelper.getRequiredConfigurationValue(ConfigurationValue.EMR_DEFAULT_SERVICE_IAM_ROLE_NAME),
                runJobFlowRequest.getServiceRole());
            List<StepConfig> stepConfigs = runJobFlowRequest.getSteps();
            assertEquals(0, stepConfigs.size());
            // The single node tag from the definition must be carried over.
            List<Tag> tags = runJobFlowRequest.getTags();
            assertEquals(1, tags.size());
            {
                Tag tag = tags.get(0);
                assertEquals("tagName", tag.getKey());
                assertEquals("tagValue", tag.getValue());
            }
            return clusterId;
        }
    });

    assertEquals(clusterId, emrDao.createEmrCluster(clusterName, emrClusterDefinition, getAwsParamsDto()));
}
Example #21
Source File: EmrOperatorFactory.java From digdag with Apache License 2.0 | 4 votes |
/**
 * Builds and submits a RunJobFlowRequest for a brand-new EMR cluster from the
 * operator configuration: compiles the steps, resolves applications, EMR
 * configurations and bootstrap actions, stages files to S3, then calls
 * {@code emr.runJobFlow}.
 *
 * @param emr the EMR client used to submit the request
 * @param tag unique tag identifying this digdag run; appended to names and set as a cluster tag
 * @param stepCompiler compiles the operator steps into StepConfig objects
 * @param cluster the "cluster" section of the operator config
 * @param filer stages local files to S3
 * @param parameterCompiler compiles parameters for bootstrap actions
 * @return the new cluster's id and its step count
 * @throws IOException if staging or compilation fails
 */
private NewCluster submitNewClusterRequest(AmazonElasticMapReduce emr, String tag, StepCompiler stepCompiler, Config cluster, Filer filer,
    ParameterCompiler parameterCompiler)
    throws IOException
{
    RemoteFile runner = prepareRunner(filer, tag);

    // Compile steps
    stepCompiler.compile(runner);
    List<StepConfig> stepConfigs = stepCompiler.stepConfigs();

    Config ec2 = cluster.getNested("ec2");
    Config master = ec2.getNestedOrGetEmpty("master");
    // "core" is a single optional config; "task" may list several groups.
    List<Config> core = ec2.getOptional("core", Config.class).transform(ImmutableList::of).or(ImmutableList.of());
    List<Config> task = ec2.getListOrEmpty("task", Config.class);

    // Default application set when none is configured explicitly.
    List<String> applications = cluster.getListOrEmpty("applications", String.class);
    if (applications.isEmpty()) {
        applications = ImmutableList.of("Hadoop", "Hive", "Spark", "Flink");
    }
    // TODO: allow configuring additional application parameters
    List<Application> applicationConfigs = applications.stream()
        .map(application -> new Application().withName(application))
        .collect(toList());

    // TODO: merge configurations with the same classification?
    List<Configuration> configurations = cluster.getListOrEmpty("configurations", JsonNode.class).stream()
        .map(this::configurations)
        .flatMap(Collection::stream)
        .collect(toList());

    // Bootstrap actions are numbered starting at 1.
    List<JsonNode> bootstrap = cluster.getListOrEmpty("bootstrap", JsonNode.class);
    List<BootstrapActionConfig> bootstrapActions = new ArrayList<>();
    for (int i = 0; i < bootstrap.size(); i++) {
        bootstrapActions.add(bootstrapAction(i + 1, bootstrap.get(i), tag, filer, runner, parameterCompiler));
    }

    // Stage files to S3
    filer.stageFiles();

    Optional<String> subnetId = ec2.getOptional("subnet_id", String.class);

    // Instance-type defaults depend on VPC availability.
    String defaultMasterInstanceType;
    String defaultCoreInstanceType;
    String defaultTaskInstanceType;
    if (subnetId.isPresent()) {
        // m4 requires VPC (subnet id)
        defaultMasterInstanceType = "m4.2xlarge";
        defaultCoreInstanceType = "m4.xlarge";
        defaultTaskInstanceType = "m4.xlarge";
    }
    else {
        defaultMasterInstanceType = "m3.2xlarge";
        defaultCoreInstanceType = "m3.xlarge";
        defaultTaskInstanceType = "m3.xlarge";
    }

    RunJobFlowRequest request = new RunJobFlowRequest()
        .withName(cluster.get("name", String.class, "Digdag") + " (" + tag + ")")
        .withReleaseLabel(cluster.get("release", String.class, "emr-5.2.0"))
        .withSteps(stepConfigs)
        .withBootstrapActions(bootstrapActions)
        .withApplications(applicationConfigs)
        .withLogUri(cluster.get("logs", String.class, null))
        .withJobFlowRole(cluster.get("cluster_role", String.class, "EMR_EC2_DefaultRole"))
        .withServiceRole(cluster.get("service_role", String.class, "EMR_DefaultRole"))
        .withTags(new Tag().withKey("DIGDAG_CLUSTER_ID").withValue(tag))
        .withVisibleToAllUsers(cluster.get("visible", boolean.class, true))
        .withConfigurations(configurations)
        .withInstances(new JobFlowInstancesConfig()
            .withInstanceGroups(ImmutableList.<InstanceGroupConfig>builder()
                // Master Node
                .add(instanceGroupConfig("Master", master, "MASTER", defaultMasterInstanceType, 1))
                // Core Group
                .addAll(instanceGroupConfigs("Core", core, "CORE", defaultCoreInstanceType))
                // Task Groups
                .addAll(instanceGroupConfigs("Task %d", task, "TASK", defaultTaskInstanceType))
                .build()
            )
            .withAdditionalMasterSecurityGroups(ec2.getListOrEmpty("additional_master_security_groups", String.class))
            .withAdditionalSlaveSecurityGroups(ec2.getListOrEmpty("additional_slave_security_groups", String.class))
            .withEmrManagedMasterSecurityGroup(ec2.get("emr_managed_master_security_group", String.class, null))
            .withEmrManagedSlaveSecurityGroup(ec2.get("emr_managed_slave_security_group", String.class, null))
            .withServiceAccessSecurityGroup(ec2.get("service_access_security_group", String.class, null))
            .withTerminationProtected(cluster.get("termination_protected", boolean.class, false))
            .withPlacement(cluster.getOptional("availability_zone", String.class)
                .transform(zone -> new PlacementType().withAvailabilityZone(zone)).orNull())
            .withEc2SubnetId(subnetId.orNull())
            .withEc2KeyName(ec2.get("key", String.class))
            // auto_terminate defaults to true, which maps to keep-alive = false.
            .withKeepJobFlowAliveWhenNoSteps(!cluster.get("auto_terminate", boolean.class, true)));

    logger.info("Submitting EMR job with {} steps(s)", request.getSteps().size());
    RunJobFlowResult result = emr.runJobFlow(request);
    logger.info("Submitted EMR job with {} step(s): {}", request.getSteps().size(), result.getJobFlowId(), result);

    return NewCluster.of(result.getJobFlowId(), request.getSteps().size());
}
Example #22
Source File: EmrOperatorFactory.java From digdag with Apache License 2.0 | 4 votes |
/**
 * Returns the compiled step configurations.
 *
 * @return the compiled steps
 * @throws IllegalStateException if steps have not been compiled yet
 */
List<StepConfig> stepConfigs()
{
    // Same contract as Preconditions.checkState: IllegalStateException when not compiled.
    if (configs == null) {
        throw new IllegalStateException();
    }
    return configs;
}
Example #23
Source File: EMRUtils.java From aws-big-data-blog with Apache License 2.0 | 4 votes |
/** * This method uses method the AWS Java to launch an Apache HBase cluster on Amazon EMR. * * @param client - AmazonElasticMapReduce client that interfaces directly with the Amazon EMR Web Service * @param clusterIdentifier - identifier of an existing cluster * @param amiVersion - AMI to use for launching this cluster * @param keypair - A keypair for SSHing into the Amazon EMR master node * @param masterInstanceType - Master node Amazon EC2 instance type * @param coreInstanceType - core nodes Amazon EC2 instance type * @param logUri - An Amazon S3 bucket for your * @param numberOfNodes - total number of nodes in this cluster including master node * @return */ public static String createCluster(AmazonElasticMapReduce client, String clusterIdentifier, String amiVersion, String keypair, String masterInstanceType, String coreInstanceType, String logUri, int numberOfNodes) { if (clusterExists(client, clusterIdentifier)) { LOG.info("Cluster " + clusterIdentifier + " is available"); return clusterIdentifier; } //Error checking if (amiVersion == null || amiVersion.isEmpty()) throw new RuntimeException("ERROR: Please specify an AMI Version"); if (keypair == null || keypair.isEmpty()) throw new RuntimeException("ERROR: Please specify a valid Amazon Key Pair"); if (masterInstanceType == null || masterInstanceType.isEmpty()) throw new RuntimeException("ERROR: Please specify a Master Instance Type"); if (logUri == null || logUri.isEmpty()) throw new RuntimeException("ERROR: Please specify a valid Amazon S3 bucket for your logs."); if (numberOfNodes < 0) throw new RuntimeException("ERROR: Please specify at least 1 node"); RunJobFlowRequest request = new RunJobFlowRequest() .withAmiVersion(amiVersion) .withBootstrapActions(new BootstrapActionConfig() .withName("Install HBase") .withScriptBootstrapAction(new ScriptBootstrapActionConfig() .withPath("s3://elasticmapreduce/bootstrap-actions/setup-hbase"))) .withName("Job Flow With HBAse Actions") .withSteps(new StepConfig() 
//enable debugging step .withName("Enable debugging") .withActionOnFailure("TERMINATE_CLUSTER") .withHadoopJarStep(new StepFactory().newEnableDebuggingStep()), //Start HBase step - after installing it with a bootstrap action createStepConfig("Start HBase","TERMINATE_CLUSTER", "/home/hadoop/lib/hbase.jar", getHBaseArgs()), //add HBase backup step createStepConfig("Modify backup schedule","TERMINATE_JOB_FLOW", "/home/hadoop/lib/hbase.jar", getHBaseBackupArgs())) .withLogUri(logUri) .withInstances(new JobFlowInstancesConfig() .withEc2KeyName(keypair) .withInstanceCount(numberOfNodes) .withKeepJobFlowAliveWhenNoSteps(true) .withMasterInstanceType(masterInstanceType) .withSlaveInstanceType(coreInstanceType)); RunJobFlowResult result = client.runJobFlow(request); String state = null; while (!(state = clusterState(client, result.getJobFlowId())).equalsIgnoreCase("waiting")) { try { Thread.sleep(10 * 1000); LOG.info(result.getJobFlowId() + " is " + state + ". Waiting for cluster to become available."); } catch (InterruptedException e) { } if (state.equalsIgnoreCase("TERMINATED_WITH_ERRORS")){ LOG.error("Could not create EMR Cluster"); System.exit(-1); } } LOG.info("Created cluster " + result.getJobFlowId()); LOG.info("Cluster " + clusterIdentifier + " is available"); return result.getJobFlowId(); }
Example #24
Source File: EmrDao.java From herd with Apache License 2.0 | 2 votes |
/**
 * Adds a step to an existing EMR cluster based on the input.
 * <p/>
 * There are four serializable step types supported currently: 1: ShellStep - for shell scripts,
 * 2: HiveStep - for hive scripts, 3: HadoopJarStep - for custom Map Reduce jar files, and
 * 4: PigStep - for Pig scripts.
 *
 * @param clusterId EMR cluster ID.
 * @param emrStepConfig the EMR step config to be added.
 * @param awsParamsDto the proxy details.
 *
 * @return the id of the newly added step
 * @throws Exception if the step could not be added to the cluster
 */
public String addEmrStep(String clusterId, StepConfig emrStepConfig, AwsParamsDto awsParamsDto) throws Exception;
Example #25
Source File: EmrStepHelper.java From herd with Apache License 2.0 | 2 votes |
/**
 * Gets the StepConfig object for the given step.
 *
 * @param step the step object (the expected concrete type depends on the implementing helper)
 *
 * @return the step config object
 */
public abstract StepConfig getEmrStepConfig(Object step);