com.amazonaws.services.elasticmapreduce.util.StepFactory Java Examples

The following examples show how to use com.amazonaws.services.elasticmapreduce.util.StepFactory. They are taken from open-source projects; the source file and project are noted above each example.
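Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: wrap one of StepFactory's predefined steps in a StepConfig and submit it to a cluster. The class name, cluster id, credentials, and region below are assumptions for illustration only, not taken from any project in this list.

import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce;
import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClientBuilder;
import com.amazonaws.services.elasticmapreduce.model.ActionOnFailure;
import com.amazonaws.services.elasticmapreduce.model.AddJobFlowStepsRequest;
import com.amazonaws.services.elasticmapreduce.model.StepConfig;
import com.amazonaws.services.elasticmapreduce.util.StepFactory;

public class StepFactoryQuickStart {
    public static void main(String[] args) {
        // the client picks up credentials and region from the default provider chains
        AmazonElasticMapReduce emr = AmazonElasticMapReduceClientBuilder.defaultClient();

        // StepFactory builds HadoopJarStepConfig objects for common, predefined EMR steps
        StepFactory stepFactory = new StepFactory();
        StepConfig enableDebugging = new StepConfig()
                .withName("Enable debugging")
                .withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW)
                .withHadoopJarStep(stepFactory.newEnableDebuggingStep());

        // submit the step to an already running cluster (hypothetical cluster id)
        emr.addJobFlowSteps(new AddJobFlowStepsRequest()
                .withJobFlowId("j-XXXXXXXXXXXX")
                .withSteps(enableDebugging));
    }
}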
Example #1
Source File: emr-add-steps.java    From aws-doc-sdk-examples with Apache License 2.0
public static void main(String[] args) {
    AWSCredentials credentials_profile = null;
    try {
        credentials_profile = new ProfileCredentialsProvider("default").getCredentials();
    } catch (Exception e) {
        throw new AmazonClientException(
                "Cannot load credentials from .aws/credentials file. " +
                "Make sure that the credentials file exists and the profile name is specified within it.",
                e);
    }

    AmazonElasticMapReduce emr = AmazonElasticMapReduceClientBuilder.standard()
            .withCredentials(new AWSStaticCredentialsProvider(credentials_profile))
            .withRegion(Regions.US_WEST_1)
            .build();

    // Run a bash script using a predefined step in the StepFactory helper class
    StepFactory stepFactory = new StepFactory();
    StepConfig runBashScript = new StepConfig()
            .withName("Run a bash script")
            .withHadoopJarStep(stepFactory.newScriptRunnerStep("s3://jeffgoll/emr-scripts/create_users.sh"))
            .withActionOnFailure("CONTINUE");

    // Run a custom jar file as a step
    HadoopJarStepConfig hadoopConfig1 = new HadoopJarStepConfig()
            .withJar("s3://path/to/my/jarfolder") // replace with the location of the jar to run as a step
            .withMainClass("com.my.Main1") // optional main class; can be omitted if the jar above has a manifest
            .withArgs("--verbose"); // optional list of arguments to pass to the jar
    StepConfig myCustomJarStep = new StepConfig("RunHadoopJar", hadoopConfig1);

    AddJobFlowStepsResult result = emr.addJobFlowSteps(new AddJobFlowStepsRequest()
            .withJobFlowId("j-xxxxxxxxxxxx") // replace with the cluster id to run the steps on
            .withSteps(runBashScript, myCustomJarStep));

    System.out.println(result.getStepIds());
}
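A hypothetical follow-up to this sample (not part of the original file): the step ids returned by addJobFlowSteps can be polled with DescribeStep to watch each step's progress. The helper below assumes the emr client and cluster id from the example above and requires imports for DescribeStepRequest and DescribeStepResult from com.amazonaws.services.elasticmapreduce.model.

static void printStepState(AmazonElasticMapReduce emr, String clusterId, String stepId) {
    DescribeStepResult stepResult = emr.describeStep(new DescribeStepRequest()
            .withClusterId(clusterId)
            .withStepId(stepId));
    // the state is one of PENDING, RUNNING, COMPLETED, CANCELLED, FAILED, or INTERRUPTED
    System.out.println(stepId + " is " + stepResult.getStep().getStatus().getState());
}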
 
Example #2
Source File: EmrPigStepHelper.java    From herd with Apache License 2.0
@Override
public StepConfig getEmrStepConfig(Object step)
{
    EmrPigStep pigStep = (EmrPigStep) step;

    // Default ActionOnFailure is to cancel the execution and wait
    ActionOnFailure actionOnFailure = ActionOnFailure.CANCEL_AND_WAIT;

    if (pigStep.isContinueOnError() != null && pigStep.isContinueOnError())
    {
        // Override based on user input
        actionOnFailure = ActionOnFailure.CONTINUE;
    }

    // If there are no arguments to the Pig script
    if (CollectionUtils.isEmpty(pigStep.getScriptArguments()))
    {
        // Just build the StepConfig object and return
        return new StepConfig().withName(pigStep.getStepName().trim()).withActionOnFailure(actionOnFailure)
            .withHadoopJarStep(new StepFactory().newRunPigScriptStep(pigStep.getScriptLocation().trim()));
    }
    // If there are arguments specified
    else
    {
        return new StepConfig().withName(pigStep.getStepName().trim()).withActionOnFailure(actionOnFailure).withHadoopJarStep(new StepFactory()
            .newRunPigScriptStep(pigStep.getScriptLocation().trim(),
                pigStep.getScriptArguments().toArray(new String[pigStep.getScriptArguments().size()])));
    }
}
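Illustration only, with hypothetical S3 locations: because this helper passes the script arguments through unchanged, any Pig parameter flags such as "-p" have to be supplied by the caller. For scriptArguments = ["-p", "INPUT=s3://my-bucket/in"], the helper effectively builds:

new StepFactory().newRunPigScriptStep("s3://my-bucket/scripts/etl.pig",
        "-p", "INPUT=s3://my-bucket/in");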
 
Example #3
Source File: EmrHiveStepHelper.java    From herd with Apache License 2.0
@Override
public StepConfig getEmrStepConfig(Object step)
{
    EmrHiveStep emrHiveStep = (EmrHiveStep) step;

    // Default ActionOnFailure is to cancel the execution and wait
    ActionOnFailure actionOnFailure = ActionOnFailure.CANCEL_AND_WAIT;

    if (emrHiveStep.isContinueOnError() != null && emrHiveStep.isContinueOnError())
    {
        // Override based on user input
        actionOnFailure = ActionOnFailure.CONTINUE;
    }

    // If there are no arguments to the Hive script
    if (CollectionUtils.isEmpty(emrHiveStep.getScriptArguments()))
    {
        // Just build the StepConfig object and return
        return new StepConfig().withName(emrHiveStep.getStepName().trim()).withActionOnFailure(actionOnFailure)
            .withHadoopJarStep(new StepFactory().newRunHiveScriptStep(emrHiveStep.getScriptLocation().trim()));
    }
    // If there are arguments specified
    else
    {
        // For each argument, add "-d" option
        List<String> hiveArgs = new ArrayList<>();
        for (String hiveArg : emrHiveStep.getScriptArguments())
        {
            hiveArgs.add("-d");
            hiveArgs.add(hiveArg);
        }
        // Return the StepConfig object
        return new StepConfig().withName(emrHiveStep.getStepName().trim()).withActionOnFailure(actionOnFailure).withHadoopJarStep(
            new StepFactory().newRunHiveScriptStep(emrHiveStep.getScriptLocation().trim(), hiveArgs.toArray(new String[hiveArgs.size()])));
    }
}
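Illustration only, with hypothetical S3 locations: unlike the Pig helper above, this helper prefixes every script argument with "-d", so callers pass plain NAME=value pairs. For scriptArguments = ["INPUT=s3://my-bucket/in", "OUTPUT=s3://my-bucket/out"], the helper effectively builds:

new StepFactory().newRunHiveScriptStep("s3://my-bucket/scripts/query.q",
        "-d", "INPUT=s3://my-bucket/in",
        "-d", "OUTPUT=s3://my-bucket/out");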
 
Example #4
Source File: EmrDaoImpl.java    From herd with Apache License 2.0
/**
 * Creates the list of step configurations for Hive/Pig installation and any custom Hadoop jar steps.
 *
 * @param emrClusterDefinition the EMR cluster definition
 *
 * @return the list of step configurations containing all the steps for the given definition
 */
private List<StepConfig> getStepConfig(EmrClusterDefinition emrClusterDefinition)
{
    StepFactory stepFactory = new StepFactory();
    List<StepConfig> appSteps = new ArrayList<>();

    // Create install hive step and add to the StepConfig list
    if (StringUtils.isNotBlank(emrClusterDefinition.getHiveVersion()))
    {
        StepConfig installHive =
            new StepConfig().withName("Hive " + emrClusterDefinition.getHiveVersion()).withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW)
                .withHadoopJarStep(stepFactory.newInstallHiveStep(emrClusterDefinition.getHiveVersion()));
        appSteps.add(installHive);
    }

    // Create install Pig step and add to the StepConfig List
    if (StringUtils.isNotBlank(emrClusterDefinition.getPigVersion()))
    {
        StepConfig installPig =
            new StepConfig().withName("Pig " + emrClusterDefinition.getPigVersion()).withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW)
                .withHadoopJarStep(stepFactory.newInstallPigStep(emrClusterDefinition.getPigVersion()));
        appSteps.add(installPig);
    }

    // Add the hadoop jar steps that need to be added.
    if (!CollectionUtils.isEmpty(emrClusterDefinition.getHadoopJarSteps()))
    {
        for (HadoopJarStep hadoopJarStep : emrClusterDefinition.getHadoopJarSteps())
        {
            StepConfig stepConfig = emrHelper
                .getEmrHadoopJarStepConfig(hadoopJarStep.getStepName(), hadoopJarStep.getJarLocation(), hadoopJarStep.getMainClass(),
                    hadoopJarStep.getScriptArguments(), hadoopJarStep.isContinueOnError());

            appSteps.add(stepConfig);
        }
    }

    return appSteps;
}
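A sketch of how the returned list is typically consumed (names and values below are assumed for illustration, not taken from herd): the step configurations are attached to the RunJobFlowRequest that creates the cluster, just as in Examples #6 and #7 below.

List<StepConfig> appSteps = getStepConfig(emrClusterDefinition);
RunJobFlowRequest request = new RunJobFlowRequest()
        .withName("herd-managed-cluster")  // assumed cluster name
        .withSteps(appSteps)               // install Hive/Pig plus any custom Hadoop jar steps
        .withInstances(new JobFlowInstancesConfig()
                .withInstanceCount(3)
                .withMasterInstanceType("m4.large")
                .withSlaveInstanceType("m4.large"));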
 
Example #5
Source File: LambdaContainer.java    From aws-big-data-blog with Apache License 2.0
protected String fireEMRJob(String paramsStr,String clusterId){
	StepFactory stepFactory = new StepFactory();
	AmazonElasticMapReduceClient emr = new AmazonElasticMapReduceClient();
	emr.setRegion(Region.getRegion(Regions.fromName(System.getenv().get("AWS_REGION"))));
	Application sparkConfig = new Application()
			.withName("Spark");
	
	String[] params = paramsStr.split(",");
	StepConfig enabledebugging = new StepConfig()
			.withName("Enable debugging")
			.withActionOnFailure("TERMINATE_JOB_FLOW")
			.withHadoopJarStep(stepFactory.newEnableDebuggingStep());
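	// Note (comment not in the original source): sparkConfig above and this enabledebugging step are
	// built but never attached to the AddJobFlowStepsRequest below; applications such as Spark can
	// only be specified when the cluster is created (see Examples #6 and #7).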
	
	HadoopJarStepConfig sparkStepConf = new HadoopJarStepConfig()
			.withJar("command-runner.jar")
			.withArgs(params);	
	
	final StepConfig sparkStep = new StepConfig()
			.withName("Spark Step")
			.withActionOnFailure("CONTINUE")
			.withHadoopJarStep(sparkStepConf);

	
	AddJobFlowStepsRequest request = new AddJobFlowStepsRequest(clusterId)
			.withSteps(new ArrayList<StepConfig>(){{add(sparkStep);}});
			

	AddJobFlowStepsResult result = emr.addJobFlowSteps(request);
	return result.getStepIds().get(0);
}
 
Example #6
Source File: create_cluster.java    From aws-doc-sdk-examples with Apache License 2.0
public static void main(String[] args) {
    AWSCredentials credentials_profile = null;
    try {
        credentials_profile = new ProfileCredentialsProvider("default").getCredentials(); // specifies any named profile in .aws/credentials as the credentials provider
    } catch (Exception e) {
        throw new AmazonClientException(
                "Cannot load credentials from .aws/credentials file. " +
                "Make sure that the credentials file exists and that the profile name is defined within it.",
                e);
    }

    // create an EMR client using the credentials and region specified in order to create the cluster
    AmazonElasticMapReduce emr = AmazonElasticMapReduceClientBuilder.standard()
            .withCredentials(new AWSStaticCredentialsProvider(credentials_profile))
            .withRegion(Regions.US_WEST_1)
            .build();

    // create a step to enable debugging in the AWS Management Console
    StepFactory stepFactory = new StepFactory();
    StepConfig enabledebugging = new StepConfig()
            .withName("Enable debugging")
            .withActionOnFailure("TERMINATE_JOB_FLOW")
            .withHadoopJarStep(stepFactory.newEnableDebuggingStep());

    // specify applications to be installed and configured when EMR creates the cluster
    Application hive = new Application().withName("Hive");
    Application spark = new Application().withName("Spark");
    Application ganglia = new Application().withName("Ganglia");
    Application zeppelin = new Application().withName("Zeppelin");

    // create the cluster
    RunJobFlowRequest request = new RunJobFlowRequest()
            .withName("MyClusterCreatedFromJava")
            .withReleaseLabel("emr-5.20.0") // specifies the EMR release label; using the latest release is recommended
            .withSteps(enabledebugging)
            .withApplications(hive, spark, ganglia, zeppelin)
            .withLogUri("s3://path/to/my/emr/logs") // a URI in S3 for log files is required when debugging is enabled
            .withServiceRole("EMR_DefaultRole") // replace the default with a custom IAM service role if one is used
            .withJobFlowRole("EMR_EC2_DefaultRole") // replace the default with a custom EMR role for the EC2 instance profile if one is used
            .withInstances(new JobFlowInstancesConfig()
                    .withEc2SubnetId("subnet-12ab34c56")
                    .withEc2KeyName("myEc2Key")
                    .withInstanceCount(3)
                    .withKeepJobFlowAliveWhenNoSteps(true)
                    .withMasterInstanceType("m4.large")
                    .withSlaveInstanceType("m4.large"));

    RunJobFlowResult result = emr.runJobFlow(request);
    System.out.println("The cluster ID is " + result.getJobFlowId());
}
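A hypothetical continuation (not part of the original sample): because withKeepJobFlowAliveWhenNoSteps(true) keeps the cluster running after its steps finish, it accrues cost until it is explicitly terminated, for example with TerminateJobFlowsRequest from com.amazonaws.services.elasticmapreduce.model.

emr.terminateJobFlows(new TerminateJobFlowsRequest()
        .withJobFlowIds(result.getJobFlowId())); // shut the cluster down when you are done with it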
 
Example #7
Source File: create-spark-cluster.java    From aws-doc-sdk-examples with Apache License 2.0
public static void main(String[] args) {
    AWSCredentials credentials_profile = null;
    try {
        credentials_profile = new ProfileCredentialsProvider("default").getCredentials();
    } catch (Exception e) {
        throw new AmazonClientException(
                "Cannot load credentials from .aws/credentials file. " +
                "Make sure that the credentials file exists and the profile name is specified within it.",
                e);
    }

    AmazonElasticMapReduce emr = AmazonElasticMapReduceClientBuilder.standard()
            .withCredentials(new AWSStaticCredentialsProvider(credentials_profile))
            .withRegion(Regions.US_WEST_1)
            .build();

    // create a step to enable debugging in the AWS Management Console
    StepFactory stepFactory = new StepFactory();
    StepConfig enabledebugging = new StepConfig()
            .withName("Enable debugging")
            .withActionOnFailure("TERMINATE_JOB_FLOW")
            .withHadoopJarStep(stepFactory.newEnableDebuggingStep());

    Application spark = new Application().withName("Spark");

    RunJobFlowRequest request = new RunJobFlowRequest()
            .withName("Spark Cluster")
            .withReleaseLabel("emr-5.20.0")
            .withSteps(enabledebugging)
            .withApplications(spark)
            .withLogUri("s3://path/to/my/logs/")
            .withServiceRole("EMR_DefaultRole")
            .withJobFlowRole("EMR_EC2_DefaultRole")
            .withInstances(new JobFlowInstancesConfig()
                    .withEc2SubnetId("subnet-12ab3c45")
                    .withEc2KeyName("myEc2Key")
                    .withInstanceCount(3)
                    .withKeepJobFlowAliveWhenNoSteps(true)
                    .withMasterInstanceType("m4.large")
                    .withSlaveInstanceType("m4.large"));

    RunJobFlowResult result = emr.runJobFlow(request);
    System.out.println("The cluster ID is " + result.getJobFlowId());
}
 
Example #8
Source File: EmrOperatorFactory.java    From digdag with Apache License 2.0
private StepFactory stepFactory()
{
    // TODO: configure region
    return new StepFactory();
}
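One way to address the TODO above, assuming the single-argument StepFactory constructor in the v1 SDK that takes the regional script bucket (the no-argument constructor defaults to the us-east-1 bucket):

private StepFactory stepFactory(String region)
{
    // e.g. "eu-west-1.elasticmapreduce"; predefined steps then resolve region-local scripts
    return new StepFactory(region + ".elasticmapreduce");
}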
 
Example #9
Source File: EMRUtils.java    From aws-big-data-blog with Apache License 2.0
/**
 * This method uses the AWS SDK for Java to launch an Apache HBase cluster on Amazon EMR.
 *
 * @param client - AmazonElasticMapReduce client that interfaces directly with the Amazon EMR web service
 * @param clusterIdentifier - identifier of an existing cluster
 * @param amiVersion - AMI to use for launching this cluster
 * @param keypair - a key pair for SSHing into the Amazon EMR master node
 * @param masterInstanceType - master node Amazon EC2 instance type
 * @param coreInstanceType - core node Amazon EC2 instance type
 * @param logUri - an Amazon S3 location for your cluster logs
 * @param numberOfNodes - total number of nodes in this cluster, including the master node
 * @return the cluster (job flow) identifier
 */
public static String createCluster(AmazonElasticMapReduce client,
		String clusterIdentifier,
		String amiVersion,
		String keypair,
		String masterInstanceType,
		String coreInstanceType,
		String logUri,
		int numberOfNodes) {

	if (clusterExists(client, clusterIdentifier)) {
		LOG.info("Cluster " + clusterIdentifier + " is available");
		return clusterIdentifier;
	}
	
	//Error checking
	if (amiVersion == null || amiVersion.isEmpty()) throw new RuntimeException("ERROR: Please specify an AMI Version");
	if (keypair == null || keypair.isEmpty()) throw new RuntimeException("ERROR: Please specify a valid Amazon Key Pair");
	if (masterInstanceType == null || masterInstanceType.isEmpty()) throw new RuntimeException("ERROR: Please specify a Master Instance Type");
	if (logUri == null || logUri.isEmpty()) throw new RuntimeException("ERROR: Please specify a valid Amazon S3 bucket for your logs.");
	if (numberOfNodes < 1) throw new RuntimeException("ERROR: Please specify at least 1 node");
	  		
	  RunJobFlowRequest request = new RunJobFlowRequest()
	    .withAmiVersion(amiVersion)
		.withBootstrapActions(new BootstrapActionConfig()
		             .withName("Install HBase")
		             .withScriptBootstrapAction(new ScriptBootstrapActionConfig()
		             .withPath("s3://elasticmapreduce/bootstrap-actions/setup-hbase")))
		.withName("Job Flow With HBAse Actions")	 
		.withSteps(new StepConfig() //enable debugging step
					.withName("Enable debugging")
					.withActionOnFailure("TERMINATE_CLUSTER")
					.withHadoopJarStep(new StepFactory().newEnableDebuggingStep()), 
					//Start HBase step - after installing it with a bootstrap action
					createStepConfig("Start HBase","TERMINATE_CLUSTER", "/home/hadoop/lib/hbase.jar", getHBaseArgs()), 
					//add HBase backup step
					createStepConfig("Modify backup schedule","TERMINATE_JOB_FLOW", "/home/hadoop/lib/hbase.jar", getHBaseBackupArgs()))
		.withLogUri(logUri)
		.withInstances(new JobFlowInstancesConfig()
		.withEc2KeyName(keypair)
		.withInstanceCount(numberOfNodes)
		.withKeepJobFlowAliveWhenNoSteps(true)
		.withMasterInstanceType(masterInstanceType)
		.withSlaveInstanceType(coreInstanceType));

	RunJobFlowResult result = client.runJobFlow(request);
	
	String state = null;
	while (!(state = clusterState(client, result.getJobFlowId())).equalsIgnoreCase("waiting")) {
		try {
			Thread.sleep(10 * 1000);
			LOG.info(result.getJobFlowId() + " is " + state + ". Waiting for cluster to become available.");
		} catch (InterruptedException e) {
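			// note: the interrupt is swallowed here, so polling simply continues until the cluster is ready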

		}
		
		if (state.equalsIgnoreCase("TERMINATED_WITH_ERRORS")){
			LOG.error("Could not create EMR Cluster");
			System.exit(-1);	
		}
	}
	LOG.info("Created cluster " + result.getJobFlowId());
	LOG.info("Cluster " + clusterIdentifier + " is available");	
	return result.getJobFlowId();
}
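A hypothetical invocation of the helper above (all values are assumed for illustration; this sample predates EMR release labels, so it takes an AMI version):

String clusterId = EMRUtils.createCluster(
        AmazonElasticMapReduceClientBuilder.defaultClient(),
        "hbase-demo-cluster",        // clusterIdentifier
        "3.11.0",                    // amiVersion (assumed)
        "myEc2Key",                  // keypair
        "m3.xlarge",                 // masterInstanceType (assumed)
        "m3.xlarge",                 // coreInstanceType (assumed)
        "s3://my-bucket/emr-logs/",  // logUri
        3);                          // numberOfNodes, including the master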