Java Code Examples for com.amazonaws.services.elasticmapreduce.model.StepConfig

The following examples show how to use com.amazonaws.services.elasticmapreduce.model.StepConfig. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: herd   Source File: MockEmrOperationsImpl.java    License: Apache License 2.0 6 votes vote down vote up
private MockEmrJobFlow createNewCluster(RunJobFlowRequest jobFlowRequest, String status, StatusChangeReason reason, StatusTimeline timeline)
{
    // Build the mock cluster and register it under a freshly generated job flow id.
    MockEmrJobFlow mockCluster = new MockEmrJobFlow();
    mockCluster.setJobFlowId(getNewJobFlowId());
    mockCluster.setJobFlowName(jobFlowRequest.getName());
    mockCluster.setStatus(status);
    mockCluster.setStatusTimeline(timeline);
    mockCluster.setStatusChangeReason(reason);
    emrClusters.put(mockCluster.getJobFlowId(), mockCluster);

    // Register every step requested for this job flow against the new cluster.
    jobFlowRequest.getSteps().forEach(stepConfig -> addClusterStep(mockCluster.getJobFlowId(), stepConfig));

    return mockCluster;
}
 
Example 2
Source Project: datacollector   Source File: EmrClusterJob.java    License: Apache License 2.0 6 votes vote down vote up
@Override
public Properties submitJob(Properties jobProps) throws IOException {
  EMRJobConfig jobConfig = new EMRJobConfig(jobProps);
  Utils.checkNotNull(jobConfig.getClusterId(), "EMR Cluster Id");
  // Assemble the hadoop jar step that launches the driver on the cluster.
  HadoopJarStepConfig jarStep = new HadoopJarStepConfig()
      .withJar(jobConfig.getDriverJarPath())
      .withMainClass(jobConfig.getDriverMainClass())
      .withArgs(
          jobConfig.getArchives(),
          jobConfig.getLibjars(),
          jobConfig.getUniquePrefix(),
          jobConfig.getJavaOpts(),
          jobConfig.getLogLevel());
  StepConfig stepConfig = new StepConfig()
      .withName(jobConfig.getJobName())
      .withActionOnFailure(ActionOnFailure.CONTINUE) // check if action on failure needs to be configurable
      .withHadoopJarStep(jarStep);
  LOG.debug("Step config is {}", stepConfig.toString());
  // Submit the step to the already-running cluster and record the resulting step id.
  AddJobFlowStepsRequest stepsRequest = new AddJobFlowStepsRequest()
      .withJobFlowId(jobConfig.getClusterId())
      .withSteps(stepConfig);
  AddJobFlowStepsResult stepsResult = getEmrClient(emrClusterConfig).addJobFlowSteps(stepsRequest);
  jobProps.setProperty("stepId", stepsResult.getStepIds().get(0));
  return jobProps;
}
 
Example 3
Source Project: herd   Source File: EmrPigStepHelper.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public StepConfig getEmrStepConfig(Object step)
{
    EmrPigStep pigStep = (EmrPigStep) step;

    // Cancel the execution and wait by default; continue only when the user opted in.
    ActionOnFailure actionOnFailure =
        Boolean.TRUE.equals(pigStep.isContinueOnError()) ? ActionOnFailure.CONTINUE : ActionOnFailure.CANCEL_AND_WAIT;

    StepConfig stepConfig = new StepConfig().withName(pigStep.getStepName().trim()).withActionOnFailure(actionOnFailure);

    // If there are no arguments to the Pig script, run it as-is.
    if (CollectionUtils.isEmpty(pigStep.getScriptArguments()))
    {
        return stepConfig.withHadoopJarStep(new StepFactory().newRunPigScriptStep(pigStep.getScriptLocation().trim()));
    }

    // Otherwise pass the script arguments through to the Pig script.
    return stepConfig.withHadoopJarStep(new StepFactory().newRunPigScriptStep(pigStep.getScriptLocation().trim(),
        pigStep.getScriptArguments().toArray(new String[0])));
}
 
Example 4
Source Project: herd   Source File: EmrHiveStepHelper.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public StepConfig getEmrStepConfig(Object step)
{
    EmrHiveStep hiveStep = (EmrHiveStep) step;

    // Cancel the execution and wait by default; continue only when the user opted in.
    ActionOnFailure actionOnFailure =
        Boolean.TRUE.equals(hiveStep.isContinueOnError()) ? ActionOnFailure.CONTINUE : ActionOnFailure.CANCEL_AND_WAIT;

    StepConfig stepConfig = new StepConfig().withName(hiveStep.getStepName().trim()).withActionOnFailure(actionOnFailure);

    // If there are no arguments to the hive script, run it as-is.
    if (CollectionUtils.isEmpty(hiveStep.getScriptArguments()))
    {
        return stepConfig.withHadoopJarStep(new StepFactory().newRunHiveScriptStep(hiveStep.getScriptLocation().trim()));
    }

    // Prefix every argument with hive's "-d" (define) option before passing them along.
    List<String> hiveArgs = new ArrayList<>();
    for (String hiveArg : hiveStep.getScriptArguments())
    {
        hiveArgs.add("-d");
        hiveArgs.add(hiveArg);
    }
    return stepConfig.withHadoopJarStep(
        new StepFactory().newRunHiveScriptStep(hiveStep.getScriptLocation().trim(), hiveArgs.toArray(new String[0])));
}
 
Example 5
Source Project: herd   Source File: EmrHadoopJarStepHelper.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public StepConfig getEmrStepConfig(Object step)
{
    // Delegate construction of the hadoop jar step config to the shared EMR helper.
    EmrHadoopJarStep jarStep = (EmrHadoopJarStep) step;
    return emrHelper.getEmrHadoopJarStepConfig(
        jarStep.getStepName(),
        jarStep.getJarLocation(),
        jarStep.getMainClass(),
        jarStep.getScriptArguments(),
        jarStep.isContinueOnError());
}
 
Example 6
Source Project: herd   Source File: EmrShellStepHelper.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public StepConfig getEmrStepConfig(Object step)
{
    EmrShellStep shellStep = (EmrShellStep) step;

    // Amazon-provided hadoop jar used to execute shell scripts on the cluster.
    String shellScriptJar = configurationHelper.getProperty(ConfigurationValue.EMR_SHELL_SCRIPT_JAR_PATH);

    // Cancel the execution and wait by default; continue only when the user opted in.
    ActionOnFailure actionOnFailure =
        Boolean.TRUE.equals(shellStep.isContinueOnError()) ? ActionOnFailure.CONTINUE : ActionOnFailure.CANCEL_AND_WAIT;

    // The first argument is the script location, followed by any trimmed script arguments.
    List<String> arguments = new ArrayList<>();
    arguments.add(shellStep.getScriptLocation().trim());
    if (!CollectionUtils.isEmpty(shellStep.getScriptArguments()))
    {
        for (String scriptArgument : shellStep.getScriptArguments())
        {
            arguments.add(scriptArgument.trim());
        }
    }

    // Build and return the StepConfig object.
    return new StepConfig().withName(shellStep.getStepName().trim()).withActionOnFailure(actionOnFailure)
        .withHadoopJarStep(new HadoopJarStepConfig(shellScriptJar).withArgs(arguments));
}
 
Example 7
Source Project: herd   Source File: EmrHelper.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the StepConfig for a Hadoop jar step.
 *
 * @param stepName the step name.
 * @param jarLocation the location of the jar.
 * @param mainClass the main class.
 * @param scriptArguments the step arguments; may be null or empty.
 * @param isContinueOnError when true, continue on step failure; otherwise cancel and wait.
 *
 * @return the stepConfig.
 */
public StepConfig getEmrHadoopJarStepConfig(String stepName, String jarLocation, String mainClass, List<String> scriptArguments, Boolean isContinueOnError)
{
    // Cancel the execution and wait by default; continue only when explicitly requested.
    ActionOnFailure actionOnFailure =
        Boolean.TRUE.equals(isContinueOnError) ? ActionOnFailure.CONTINUE : ActionOnFailure.CANCEL_AND_WAIT;

    // Assemble the hadoop jar step; arguments are attached only when supplied.
    HadoopJarStepConfig hadoopJarStepConfig = new HadoopJarStepConfig().withJar(jarLocation.trim()).withMainClass(mainClass);
    if (!CollectionUtils.isEmpty(scriptArguments))
    {
        hadoopJarStepConfig.withArgs(scriptArguments.toArray(new String[0]));
    }

    return new StepConfig().withName(stepName.trim()).withActionOnFailure(actionOnFailure).withHadoopJarStep(hadoopJarStepConfig);
}
 
Example 8
Source Project: herd   Source File: EmrDaoImpl.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public String addEmrStep(String clusterId, StepConfig emrStepConfig, AwsParamsDto awsParamsDto) throws Exception
{
    // Wrap the single step in a job flow steps request targeted at the given cluster.
    AddJobFlowStepsRequest jobFlowStepRequest = new AddJobFlowStepsRequest().withJobFlowId(clusterId).withSteps(emrStepConfig);

    // Submit the request and return the id of the (only) added step.
    List<String> emrStepIds = emrOperations.addJobFlowStepsRequest(getEmrClient(awsParamsDto), jobFlowStepRequest);
    return emrStepIds.get(0);
}
 
Example 9
Source Project: herd   Source File: EmrDaoImpl.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Create the step config list of objects for hive/pig installation.
 *
 * @param emrClusterDefinition the EMR definition name value.
 *
 * @return list of step configuration that contains all the steps for the given configuration.
 */
private List<StepConfig> getStepConfig(EmrClusterDefinition emrClusterDefinition)
{
    StepFactory stepFactory = new StepFactory();
    List<StepConfig> appSteps = new ArrayList<>();

    // Add an install-hive step when a hive version is configured.
    String hiveVersion = emrClusterDefinition.getHiveVersion();
    if (StringUtils.isNotBlank(hiveVersion))
    {
        appSteps.add(new StepConfig().withName("Hive " + hiveVersion).withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW)
            .withHadoopJarStep(stepFactory.newInstallHiveStep(hiveVersion)));
    }

    // Add an install-pig step when a pig version is configured.
    String pigVersion = emrClusterDefinition.getPigVersion();
    if (StringUtils.isNotBlank(pigVersion))
    {
        appSteps.add(new StepConfig().withName("Pig " + pigVersion).withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW)
            .withHadoopJarStep(stepFactory.newInstallPigStep(pigVersion)));
    }

    // Append any custom hadoop jar steps declared in the definition.
    if (!CollectionUtils.isEmpty(emrClusterDefinition.getHadoopJarSteps()))
    {
        for (HadoopJarStep hadoopJarStep : emrClusterDefinition.getHadoopJarSteps())
        {
            appSteps.add(emrHelper.getEmrHadoopJarStepConfig(hadoopJarStep.getStepName(), hadoopJarStep.getJarLocation(),
                hadoopJarStep.getMainClass(), hadoopJarStep.getScriptArguments(), hadoopJarStep.isContinueOnError()));
        }
    }

    return appSteps;
}
 
Example 10
Source Project: herd   Source File: EmrHelperTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies the basic hadoop jar step config: name and jar are set as given when no main
 * class, no arguments, and continue-on-error disabled.
 */
@Test
public void testEmrHadoopJarStepConfig()
{
    StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, null, false);

    // Fixed typo in the assertion message: "retuned" -> "returned" (consistent with testEmrHadoopJarStepConfigWithArguments).
    assertNotNull("step not returned", stepConfig);

    assertEquals("name not found", "step_name", stepConfig.getName());
    assertEquals("jar not found", "jar_location", stepConfig.getHadoopJarStep().getJar());
}
 
Example 11
Source Project: herd   Source File: EmrHelperTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies the hadoop jar step config when continue-on-error is null (unspecified), which
 * should fall back to the default failure action.
 */
@Test
public void testEmrHadoopJarStepConfigNoContinueOnError()
{
    StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, null, null);

    // Fixed typo in the assertion message: "retuned" -> "returned" (consistent with testEmrHadoopJarStepConfigWithArguments).
    assertNotNull("step not returned", stepConfig);

    assertEquals("name not found", "step_name", stepConfig.getName());
    assertEquals("jar not found", "jar_location", stepConfig.getHadoopJarStep().getJar());
}
 
Example 12
Source Project: herd   Source File: EmrHelperTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies the hadoop jar step config when continue-on-error is explicitly enabled.
 */
@Test
public void testEmrHadoopJarStepConfigContinueOnError()
{
    StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, null, true);

    // Fixed typo in the assertion message: "retuned" -> "returned" (consistent with testEmrHadoopJarStepConfigWithArguments).
    assertNotNull("step not returned", stepConfig);

    assertEquals("name not found", "step_name", stepConfig.getName());
    assertEquals("jar not found", "jar_location", stepConfig.getHadoopJarStep().getJar());
}
 
Example 13
Source Project: herd   Source File: EmrHelperTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that supplied script arguments are carried through to the hadoop jar step.
 */
@Test
public void testEmrHadoopJarStepConfigWithArguments()
{
    // Build a step config that carries a single script argument.
    List<String> arguments = new ArrayList<>();
    arguments.add("arg1");
    StepConfig stepConfig = emrHelper.getEmrHadoopJarStepConfig("step_name", "jar_location", null, arguments, false);

    assertNotNull("step not returned", stepConfig);
    assertEquals("name not found", "step_name", stepConfig.getName());
    assertEquals("jar not found", "jar_location", stepConfig.getHadoopJarStep().getJar());
    assertNotNull("arguments not found", stepConfig.getHadoopJarStep().getArgs());
}
 
Example 14
Source Project: herd   Source File: MockEmrOperationsImpl.java    License: Apache License 2.0 5 votes vote down vote up
private MockEmrJobFlow addClusterStep(String jobFlowId, StepConfig step)
{
    // Lazily create the step list for this cluster.
    List<MockEmrJobFlow> clusterSteps = getStepsByClusterId(jobFlowId);
    if (clusterSteps == null)
    {
        clusterSteps = new ArrayList<>();
    }

    String stepName = step.getName();

    MockEmrJobFlow mockStep = new MockEmrJobFlow();
    // The special "running without id" step deliberately gets no job flow id assigned.
    if (!stepName.equalsIgnoreCase(MOCK_STEP_RUNNING_WITHOUT_ID_NAME))
    {
        mockStep.setJobFlowId(getNewJobFlowId());
    }
    mockStep.setJobFlowName(stepName);
    // Steps with either "running" mock name start out RUNNING; everything else starts PENDING.
    boolean startRunning = stepName.equalsIgnoreCase(MOCK_STEP_RUNNING_NAME) || stepName.equalsIgnoreCase(MOCK_STEP_RUNNING_WITHOUT_ID_NAME);
    mockStep.setStatus(startRunning ? StepState.RUNNING.toString() : StepState.PENDING.toString());
    mockStep.setJarLocation(step.getHadoopJarStep().getJar());

    clusterSteps.add(mockStep);
    setStepsByClusterId(jobFlowId, clusterSteps);
    return mockStep;
}
 
Example 15
Source Project: digdag   Source File: EmrOperatorFactory.java    License: Apache License 2.0 5 votes vote down vote up
private void addStep(String name, CommandRunnerConfiguration configuration)
        throws IOException
{
    // Serialize the command runner configuration into an in-memory "config.json".
    FileReference configFileRef = ImmutableFileReference.builder()
            .type(FileReference.Type.DIRECT)
            .contents(objectMapper.writeValueAsBytes(configuration))
            .filename("config.json")
            .build();

    // Stage the configuration file remotely and register a script runner step that consumes it.
    RemoteFile stagedConfigFile = prepareRemoteFile(configFileRef, false);
    configs.add(stepConfig(name, tag, step)
            .withHadoopJarStep(stepFactory().newScriptRunnerStep(
                    runner.s3Uri().toString(), stagedConfigFile.s3Uri().toString())));
}
 
Example 16
Source Project: digdag   Source File: EmrOperatorFactory.java    License: Apache License 2.0 5 votes vote down vote up
private StepConfig stepConfig(String defaultName, String tag, Config step)
{
    // Steps may override the default name via their "name" setting; the tag is always appended.
    String stepName = step.get("name", String.class, defaultName);
    // Valid values: TERMINATE_JOB_FLOW | TERMINATE_CLUSTER | CANCEL_AND_WAIT | CONTINUE
    String actionOnFailure = step.get("action_on_failure", String.class, defaultActionOnFailure);
    return new StepConfig()
            .withName(stepName + " (" + tag + ")")
            .withActionOnFailure(actionOnFailure);
}
 
Example 17
Source Project: datacollector   Source File: EmrClusterJob.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public String createCluster(String clusterName) {
  // Base cluster request: roles, visibility, and instance topology from the cluster config.
  RunJobFlowRequest request = new RunJobFlowRequest()
      .withName(clusterName)
      .withReleaseLabel(EmrInfo.getVersion())
      .withServiceRole(emrClusterConfig.getServiceRole())
      .withJobFlowRole(emrClusterConfig.getJobFlowRole())
      .withVisibleToAllUsers(emrClusterConfig.isVisibleToAllUsers())
      .withInstances(new JobFlowInstancesConfig()
          .withEc2SubnetId(emrClusterConfig.getEc2SubnetId())
          .withEmrManagedMasterSecurityGroup(emrClusterConfig.getMasterSecurityGroup())
          .withEmrManagedSlaveSecurityGroup(emrClusterConfig.getSlaveSecurityGroup())
          .withInstanceCount(emrClusterConfig.getInstanceCount())
          .withKeepJobFlowAliveWhenNoSteps(true)
          .withMasterInstanceType(emrClusterConfig.getMasterInstanceType())
          .withSlaveInstanceType(emrClusterConfig.getSlaveInstanceType()));

  if (emrClusterConfig.isLoggingEnabled()) {
    request.withLogUri(emrClusterConfig.getS3LogUri());
    if (emrClusterConfig.isEnableEmrDebugging()) {
      // Debugging is enabled by running the state-pusher script through command-runner.
      String commandRunner = "command-runner.jar";
      String debuggingCommand = "state-pusher-script";
      String debuggingName = "Setup Hadoop Debugging";
      StepConfig enableDebugging = new StepConfig()
          .withName(debuggingName)
          .withActionOnFailure(ActionOnFailure.CONTINUE)
          .withHadoopJarStep(new HadoopJarStepConfig()
              .withJar(commandRunner)
              .withArgs(debuggingCommand));
      request.withSteps(enableDebugging);
    }
  }
  return getEmrClient(emrClusterConfig).runJobFlow(request).getJobFlowId();
}
 
Example 18
Source Project: aws-big-data-blog   Source File: EMRUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * This is a helper method for creating step configuration information.
 * @param stepName - a custom name to label this step
 * @param actionOnFailure - options are terminate cluster, terminate job flow, continue
 * @param jarPath - path to jar file - could be on S3 or local file system
 * @param args list of Java args to configure custom step
 * @return the assembled step configuration
 */
private static StepConfig createStepConfig(String stepName, String actionOnFailure, String jarPath, List<String> args) {
	return new StepConfig()
			.withName(stepName)
			.withActionOnFailure(actionOnFailure)
			.withHadoopJarStep(new HadoopJarStepConfig()
					.withJar(jarPath)
					.withArgs(args));
}
 
Example 19
Source Project: aws-big-data-blog   Source File: LambdaContainer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Submits a Spark step (run via command-runner.jar) to an existing EMR cluster and
 * returns the id of the added step.
 *
 * @param paramsStr comma-separated arguments passed to command-runner
 * @param clusterId the id of the target EMR cluster
 * @return the id of the submitted step
 */
protected String fireEMRJob(String paramsStr, String clusterId){
	// NOTE(review): the original built an "Enable debugging" StepConfig (via StepFactory)
	// and an Application named "Spark" here but never attached either to the request —
	// both were dead code and have been removed. If debugging should actually be enabled,
	// add new StepFactory().newEnableDebuggingStep() as an explicit extra step.
	AmazonElasticMapReduceClient emr = new AmazonElasticMapReduceClient();
	emr.setRegion(Region.getRegion(Regions.fromName(System.getenv().get("AWS_REGION"))));

	// Run the Spark job through command-runner with the comma-separated parameters.
	HadoopJarStepConfig sparkStepConf = new HadoopJarStepConfig()
			.withJar("command-runner.jar")
			.withArgs(paramsStr.split(","));

	StepConfig sparkStep = new StepConfig()
			.withName("Spark Step")
			.withActionOnFailure("CONTINUE")
			.withHadoopJarStep(sparkStepConf);

	// Replaced double-brace initialization (which creates a leaky anonymous inner class)
	// with the SDK's withSteps varargs.
	AddJobFlowStepsRequest request = new AddJobFlowStepsRequest(clusterId)
			.withSteps(sparkStep);

	AddJobFlowStepsResult result = emr.addJobFlowSteps(request);
	return result.getStepIds().get(0);
}
 
Example 20
Source Project: herd   Source File: EmrDaoTest.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Verifies that creating an EMR cluster from a definition that uses an instance fleet and a
 * comma-separated list of EC2 subnet ids produces a RunJobFlowRequest with the expected
 * instance fleet (and no instance groups), both subnet ids, the default IAM roles, no steps,
 * and the definition's node tags.
 */
@Test
public void createEmrClusterAssertCallRunEmrJobFlowWithInstanceFleetAndMultipleSubnets()
{
    // Create objects required for testing.
    final String clusterName = "clusterName";
    final String clusterId = "clusterId";
    final String name = STRING_VALUE;
    final String instanceFleetType = STRING_VALUE_2;
    final Integer targetOnDemandCapacity = INTEGER_VALUE;
    final Integer targetSpotCapacity = INTEGER_VALUE_2;
    final List<EmrClusterDefinitionInstanceTypeConfig> emrClusterDefinitionInstanceTypeConfigs = null;
    final EmrClusterDefinitionLaunchSpecifications emrClusterDefinitionLaunchSpecifications = null;
    final EmrClusterDefinitionInstanceFleet emrClusterDefinitionInstanceFleet =
        new EmrClusterDefinitionInstanceFleet(name, instanceFleetType, targetOnDemandCapacity, targetSpotCapacity, emrClusterDefinitionInstanceTypeConfigs,
            emrClusterDefinitionLaunchSpecifications);

    // Create an EMR cluster definition with instance fleet configuration and multiple EC2 subnet IDs.
    EmrClusterDefinition emrClusterDefinition = new EmrClusterDefinition();
    emrClusterDefinition.setInstanceFleets(Lists.newArrayList(emrClusterDefinitionInstanceFleet));
    // The subnet value deliberately contains extra whitespace; the assertions below show the
    // ids end up trimmed in the request's Ec2SubnetIds list.
    emrClusterDefinition.setSubnetId(String.format("%s , %s  ", EC2_SUBNET, EC2_SUBNET_2));
    emrClusterDefinition.setNodeTags(Lists.newArrayList(new NodeTag("tagName", "tagValue")));

    // Stub the EMR operation so we can inspect the RunJobFlowRequest the DAO builds.
    when(mockEmrOperations.runEmrJobFlow(any(), any())).then(new Answer<String>()
    {
        @Override
        public String answer(InvocationOnMock invocation)
        {
            // Assert that the given EMR cluster definition produced the correct RunJobFlowRequest.
            RunJobFlowRequest runJobFlowRequest = invocation.getArgument(1);
            JobFlowInstancesConfig jobFlowInstancesConfig = runJobFlowRequest.getInstances();
            // Instance fleets and instance groups are mutually exclusive; expect no groups here.
            assertEquals(0, CollectionUtils.size(jobFlowInstancesConfig.getInstanceGroups()));
            final List<InstanceTypeConfig> expectedInstanceTypeConfigs = null;
            assertEquals(Lists.newArrayList(
                new InstanceFleetConfig().withName(name).withInstanceFleetType(instanceFleetType).withTargetOnDemandCapacity(targetOnDemandCapacity)
                    .withTargetSpotCapacity(targetSpotCapacity).withInstanceTypeConfigs(expectedInstanceTypeConfigs).withLaunchSpecifications(null)),
                jobFlowInstancesConfig.getInstanceFleets());
            // With multiple subnets, the single-subnet field must be unset and both ids listed.
            assertNull(jobFlowInstancesConfig.getEc2SubnetId());
            assertEquals(2, CollectionUtils.size(jobFlowInstancesConfig.getEc2SubnetIds()));
            assertTrue(jobFlowInstancesConfig.getEc2SubnetIds().contains(EC2_SUBNET));
            assertTrue(jobFlowInstancesConfig.getEc2SubnetIds().contains(EC2_SUBNET_2));
            // Roles fall back to the configured EMR defaults when not set on the definition.
            assertEquals(herdStringHelper.getRequiredConfigurationValue(ConfigurationValue.EMR_DEFAULT_EC2_NODE_IAM_PROFILE_NAME),
                runJobFlowRequest.getJobFlowRole());
            assertEquals(herdStringHelper.getRequiredConfigurationValue(ConfigurationValue.EMR_DEFAULT_SERVICE_IAM_ROLE_NAME),
                runJobFlowRequest.getServiceRole());
            List<StepConfig> stepConfigs = runJobFlowRequest.getSteps();
            assertEquals(0, stepConfigs.size());
            List<Tag> tags = runJobFlowRequest.getTags();
            assertEquals(1, tags.size());
            {
                Tag tag = tags.get(0);
                assertEquals("tagName", tag.getKey());
                assertEquals("tagValue", tag.getValue());
            }

            return clusterId;
        }
    });

    assertEquals(clusterId, emrDao.createEmrCluster(clusterName, emrClusterDefinition, getAwsParamsDto()));
}
 
Example 21
Source Project: digdag   Source File: EmrOperatorFactory.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Compiles the configured steps, stages files to S3, and submits a RunJobFlowRequest for a
 * brand-new EMR cluster. Returns the new cluster's job flow id and step count.
 *
 * @param emr the EMR client used to submit the job flow
 * @param tag unique tag identifying this digdag run; appended to the cluster name and stored
 *            as the DIGDAG_CLUSTER_ID cluster tag
 * @param stepCompiler compiles the operator's step definitions into EMR StepConfigs
 * @param cluster the "cluster" configuration block
 * @param filer stages local/generated files to S3
 * @param parameterCompiler compiles parameters for bootstrap actions
 * @return the submitted cluster's id and number of steps
 * @throws IOException if staging files or compiling steps fails
 */
private NewCluster submitNewClusterRequest(AmazonElasticMapReduce emr, String tag, StepCompiler stepCompiler,
        Config cluster, Filer filer, ParameterCompiler parameterCompiler)
        throws IOException
{
    RemoteFile runner = prepareRunner(filer, tag);

    // Compile steps
    stepCompiler.compile(runner);

    List<StepConfig> stepConfigs = stepCompiler.stepConfigs();

    Config ec2 = cluster.getNested("ec2");
    Config master = ec2.getNestedOrGetEmpty("master");
    // "core" is a single optional config; "task" is a list of configs.
    List<Config> core = ec2.getOptional("core", Config.class).transform(ImmutableList::of).or(ImmutableList.of());
    List<Config> task = ec2.getListOrEmpty("task", Config.class);

    // Default application set when none is configured.
    List<String> applications = cluster.getListOrEmpty("applications", String.class);
    if (applications.isEmpty()) {
        applications = ImmutableList.of("Hadoop", "Hive", "Spark", "Flink");
    }

    // TODO: allow configuring additional application parameters
    List<Application> applicationConfigs = applications.stream()
            .map(application -> new Application().withName(application))
            .collect(toList());

    // TODO: merge configurations with the same classification?
    List<Configuration> configurations = cluster.getListOrEmpty("configurations", JsonNode.class).stream()
            .map(this::configurations)
            .flatMap(Collection::stream)
            .collect(toList());

    // Bootstrap actions are numbered starting from 1.
    List<JsonNode> bootstrap = cluster.getListOrEmpty("bootstrap", JsonNode.class);
    List<BootstrapActionConfig> bootstrapActions = new ArrayList<>();
    for (int i = 0; i < bootstrap.size(); i++) {
        bootstrapActions.add(bootstrapAction(i + 1, bootstrap.get(i), tag, filer, runner, parameterCompiler));
    }

    // Stage files to S3
    filer.stageFiles();

    Optional<String> subnetId = ec2.getOptional("subnet_id", String.class);

    String defaultMasterInstanceType;
    String defaultCoreInstanceType;
    String defaultTaskInstanceType;

    if (subnetId.isPresent()) {
        // m4 requires VPC (subnet id)
        defaultMasterInstanceType = "m4.2xlarge";
        defaultCoreInstanceType = "m4.xlarge";
        defaultTaskInstanceType = "m4.xlarge";
    }
    else {
        defaultMasterInstanceType = "m3.2xlarge";
        defaultCoreInstanceType = "m3.xlarge";
        defaultTaskInstanceType = "m3.xlarge";
    }

    // Assemble the full job flow request from the compiled pieces and cluster settings.
    RunJobFlowRequest request = new RunJobFlowRequest()
            .withName(cluster.get("name", String.class, "Digdag") + " (" + tag + ")")
            .withReleaseLabel(cluster.get("release", String.class, "emr-5.2.0"))
            .withSteps(stepConfigs)
            .withBootstrapActions(bootstrapActions)
            .withApplications(applicationConfigs)
            .withLogUri(cluster.get("logs", String.class, null))
            .withJobFlowRole(cluster.get("cluster_role", String.class, "EMR_EC2_DefaultRole"))
            .withServiceRole(cluster.get("service_role", String.class, "EMR_DefaultRole"))
            .withTags(new Tag().withKey("DIGDAG_CLUSTER_ID").withValue(tag))
            .withVisibleToAllUsers(cluster.get("visible", boolean.class, true))
            .withConfigurations(configurations)
            .withInstances(new JobFlowInstancesConfig()
                    .withInstanceGroups(ImmutableList.<InstanceGroupConfig>builder()
                            // Master Node
                            .add(instanceGroupConfig("Master", master, "MASTER", defaultMasterInstanceType, 1))
                            // Core Group
                            .addAll(instanceGroupConfigs("Core", core, "CORE", defaultCoreInstanceType))
                            // Task Groups
                            .addAll(instanceGroupConfigs("Task %d", task, "TASK", defaultTaskInstanceType))
                            .build()
                    )
                    .withAdditionalMasterSecurityGroups(ec2.getListOrEmpty("additional_master_security_groups", String.class))
                    .withAdditionalSlaveSecurityGroups(ec2.getListOrEmpty("additional_slave_security_groups", String.class))
                    .withEmrManagedMasterSecurityGroup(ec2.get("emr_managed_master_security_group", String.class, null))
                    .withEmrManagedSlaveSecurityGroup(ec2.get("emr_managed_slave_security_group", String.class, null))
                    .withServiceAccessSecurityGroup(ec2.get("service_access_security_group", String.class, null))
                    .withTerminationProtected(cluster.get("termination_protected", boolean.class, false))
                    .withPlacement(cluster.getOptional("availability_zone", String.class)
                            .transform(zone -> new PlacementType().withAvailabilityZone(zone)).orNull())
                    .withEc2SubnetId(subnetId.orNull())
                    .withEc2KeyName(ec2.get("key", String.class))
                    // auto_terminate defaults to true; keep-alive is its inverse.
                    .withKeepJobFlowAliveWhenNoSteps(!cluster.get("auto_terminate", boolean.class, true)));

    logger.info("Submitting EMR job with {} steps(s)", request.getSteps().size());
    RunJobFlowResult result = emr.runJobFlow(request);
    logger.info("Submitted EMR job with {} step(s): {}", request.getSteps().size(), result.getJobFlowId(), result);

    return NewCluster.of(result.getJobFlowId(), request.getSteps().size());
}
 
Example 22
Source Project: digdag   Source File: EmrOperatorFactory.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the step configs collected so far.
 *
 * @return the compiled step configs; calling this before they have been populated is a
 *         programming error and fails the state check
 */
List<StepConfig> stepConfigs()
{
    Preconditions.checkState(configs != null);
    return configs;
}
 
Example 23
Source Project: aws-big-data-blog   Source File: EMRUtils.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Uses the AWS Java SDK to launch an Apache HBase cluster on Amazon EMR, or returns the
 * identifier of an existing cluster when one is already available. Blocks, polling every
 * 10 seconds, until the new cluster reaches the "waiting" state.
 *
 * @param client - AmazonElasticMapReduce client that interfaces directly with the Amazon EMR Web Service
 * @param clusterIdentifier - identifier of an existing cluster
 * @param amiVersion - AMI to use for launching this cluster
 * @param keypair - a keypair for SSHing into the Amazon EMR master node
 * @param masterInstanceType - master node Amazon EC2 instance type
 * @param coreInstanceType - core nodes Amazon EC2 instance type
 * @param logUri - an Amazon S3 bucket for your logs
 * @param numberOfNodes - total number of nodes in this cluster including master node
 * @return the job flow id of the launched (or pre-existing) cluster
 */
public static String createCluster(AmazonElasticMapReduce client,
		String clusterIdentifier,
		String amiVersion,
		String keypair,
		String masterInstanceType,
		String coreInstanceType,
		String logUri,
		int numberOfNodes) {

	if (clusterExists(client, clusterIdentifier)) {
		LOG.info("Cluster " + clusterIdentifier + " is available");
		return clusterIdentifier;
	}

	//Error checking
	if (amiVersion == null || amiVersion.isEmpty()) throw new RuntimeException("ERROR: Please specify an AMI Version");
	if (keypair == null || keypair.isEmpty()) throw new RuntimeException("ERROR: Please specify a valid Amazon Key Pair");
	if (masterInstanceType == null || masterInstanceType.isEmpty()) throw new RuntimeException("ERROR: Please specify a Master Instance Type");
	if (logUri == null || logUri.isEmpty()) throw new RuntimeException("ERROR: Please specify a valid Amazon S3 bucket for your logs.");
	if (numberOfNodes < 0) throw new RuntimeException("ERROR: Please specify at least 1 node");

	RunJobFlowRequest request = new RunJobFlowRequest()
		.withAmiVersion(amiVersion)
		.withBootstrapActions(new BootstrapActionConfig()
		             .withName("Install HBase")
		             .withScriptBootstrapAction(new ScriptBootstrapActionConfig()
		             .withPath("s3://elasticmapreduce/bootstrap-actions/setup-hbase")))
		.withName("Job Flow With HBAse Actions")
		.withSteps(new StepConfig() //enable debugging step
					.withName("Enable debugging")
					.withActionOnFailure("TERMINATE_CLUSTER")
					.withHadoopJarStep(new StepFactory().newEnableDebuggingStep()),
					//Start HBase step - after installing it with a bootstrap action
					createStepConfig("Start HBase","TERMINATE_CLUSTER", "/home/hadoop/lib/hbase.jar", getHBaseArgs()),
					//add HBase backup step
					createStepConfig("Modify backup schedule","TERMINATE_JOB_FLOW", "/home/hadoop/lib/hbase.jar", getHBaseBackupArgs()))
		.withLogUri(logUri)
		.withInstances(new JobFlowInstancesConfig()
		.withEc2KeyName(keypair)
		.withInstanceCount(numberOfNodes)
		.withKeepJobFlowAliveWhenNoSteps(true)
		.withMasterInstanceType(masterInstanceType)
		.withSlaveInstanceType(coreInstanceType));

	RunJobFlowResult result = client.runJobFlow(request);

	// Poll until the cluster leaves the startup states and becomes available.
	String state = null;
	while (!(state = clusterState(client, result.getJobFlowId())).equalsIgnoreCase("waiting")) {
		try {
			Thread.sleep(10 * 1000);
			LOG.info(result.getJobFlowId() + " is " + state + ". Waiting for cluster to become available.");
		} catch (InterruptedException e) {
			// Fixed: restore the interrupt status instead of silently swallowing the
			// exception, so callers can observe that this thread was interrupted.
			Thread.currentThread().interrupt();
		}

		if (state.equalsIgnoreCase("TERMINATED_WITH_ERRORS")){
			LOG.error("Could not create EMR Cluster");
			// NOTE(review): System.exit in a library utility is drastic; consider throwing
			// a RuntimeException instead so callers can handle the failure.
			System.exit(-1);
		}
	}
	LOG.info("Created cluster " + result.getJobFlowId());
	LOG.info("Cluster " + clusterIdentifier + " is available");
	return result.getJobFlowId();
}
 
Example 24
Source Project: herd   Source File: EmrStepHelper.java    License: Apache License 2.0 2 votes vote down vote up
/**
 * This method gets the StepConfig object for the given Step.
 *
 * @param step the step object; each concrete helper casts this to its own step type
 *             (e.g. shell, hive, pig, or hadoop jar step)
 *
 * @return the step config object describing how EMR should run the step
 */
public abstract StepConfig getEmrStepConfig(Object step);
 
Example 25
Source Project: herd   Source File: EmrDao.java    License: Apache License 2.0 2 votes vote down vote up
/**
 * Add an EMR Step. This method adds the step to EMR cluster based on the input.
 *
 * @param clusterId EMR cluster ID.
 * @param emrStepConfig the EMR step config to be added.
 * @param awsParamsDto the proxy details.
 * <p/>
 * There are four serializable objects supported currently. They are 1: ShellStep - For shell scripts 2: HiveStep - For hive scripts 3: HadoopJarStep - For
 * Custom Map Reduce Jar files and 4: PigStep - For Pig scripts.
 *
 * @return the step id
 * @throws Exception if the step could not be added to the cluster
 */
public String addEmrStep(String clusterId, StepConfig emrStepConfig, AwsParamsDto awsParamsDto) throws Exception;