/*
 * Copyright (c) 2015-2019, Cloudera, Inc. All Rights Reserved.
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package com.cloudera.labs.envelope.derive;

import com.cloudera.labs.envelope.component.ComponentFactory;
import com.cloudera.labs.envelope.component.InstantiatedComponent;
import com.cloudera.labs.envelope.component.InstantiatesComponents;
import com.cloudera.labs.envelope.component.ProvidesAlias;
import com.cloudera.labs.envelope.schema.Schema;
import com.cloudera.labs.envelope.spark.Contexts;
import com.cloudera.labs.envelope.utils.ConfigUtils;
import com.cloudera.labs.envelope.utils.MorphlineUtils;
import com.cloudera.labs.envelope.utils.SchemaUtils;
import com.cloudera.labs.envelope.validate.ProvidesValidations;
import com.cloudera.labs.envelope.validate.Validations;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigValueType;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Map;
import java.util.Set;

public class MorphlineDeriver implements Deriver, ProvidesAlias, ProvidesValidations,
    InstantiatesComponents {

  private static final Logger LOG = LoggerFactory.getLogger(MorphlineDeriver.class);

  public static final String STEP_NAME_CONFIG = "step.name";
  public static final String MORPHLINE = "morphline.file";
  public static final String MORPHLINE_ID = "morphline.id";
  public static final String SCHEMA_CONFIG = "schema";
  public static final String ERROR_ON_EMPTY = "error.on.empty";

  private String stepName;
  private StructType schema;
  private String morphlineFile;
  private String morphlineId;
  private boolean errorOnEmpty;

  public void configure(Config config) {
    LOG.trace("Configuring Morphline Deriver");

    // Designate which dependency step to act upon
    this.stepName = config.getString(STEP_NAME_CONFIG);

    // Set up the Morphline configuration, the file must be located on the local file system
    this.morphlineFile = config.getString(MORPHLINE);
    this.morphlineId = config.getString(MORPHLINE_ID);

    // Construct the StructType schema for the Rows
    this.schema = ComponentFactory.create(Schema.class, config.getConfig(SCHEMA_CONFIG), true).getSchema();

    errorOnEmpty = ConfigUtils.getOrElse(config, ERROR_ON_EMPTY, true);

  public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
    if (!dependencies.containsKey(stepName)) {
      throw new RuntimeException("Step not found in the dependencies list");

    Dataset<Row> sourceStep = dependencies.get(stepName);

    // For each partition in the DataFrame / RDD
    JavaRDD<Row> outputRDD = sourceStep.toJavaRDD().flatMap(
        MorphlineUtils.morphlineMapper(this.morphlineFile, this.morphlineId, getSchema(), errorOnEmpty));

    // Convert all the Rows into a new DataFrame
    return Contexts.getSparkSession().createDataFrame(outputRDD, getSchema());

   * @return The generated StructType for the resulting DataFrame
  protected StructType getSchema() {
    return this.schema;

  public String getAlias() {
    return "morphline";

  public Validations getValidations() {
    return Validations.builder()
        .mandatoryPath(STEP_NAME_CONFIG, ConfigValueType.STRING)
        .mandatoryPath(MORPHLINE, ConfigValueType.STRING)
        .mandatoryPath(MORPHLINE_ID, ConfigValueType.STRING)
        .mandatoryPath(SCHEMA_CONFIG, ConfigValueType.OBJECT)
        .optionalPath(ERROR_ON_EMPTY, ConfigValueType.BOOLEAN)
  public Set<InstantiatedComponent> getComponents(Config config, boolean configure) {
    return SchemaUtils.getSchemaComponents(config, configure, SCHEMA_CONFIG);