Java Code Examples for org.apache.spark.api.java.JavaSparkContext#hadoopConfiguration()

The following examples show how to use org.apache.spark.api.java.JavaSparkContext#hadoopConfiguration() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: HoodieMergeOnReadTableCompactor.java From hudi with Apache License 2.0

6 votes

@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan,
    HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException {
  if (compactionPlan == null || (compactionPlan.getOperations() == null)
      || (compactionPlan.getOperations().isEmpty())) {
    return jsc.emptyRDD();
  }
  HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
  // Compacting is very similar to applying updates to existing file
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc.hadoopConfiguration(), metaClient);
  List<CompactionOperation> operations = compactionPlan.getOperations().stream()
      .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
  LOG.info("Compactor compacting " + operations + " files");

  return jsc.parallelize(operations, operations.size())
      .map(s -> compact(table, metaClient, config, s, compactionInstantTime)).flatMap(List::iterator);
}

Example 2

Source File: IncrSourceHelper.java From hudi with Apache License 2.0

6 votes

/**
 * Find begin and end instants to be set for the next fetch.
 *
 * @param jssc Java Spark Context
 * @param srcBasePath Base path of Hudi source table
 * @param numInstantsPerFetch Max Instants per fetch
 * @param beginInstant Last Checkpoint String
 * @param readLatestOnMissingBeginInstant when begin instant is missing, allow reading from latest committed instant
 * @return begin and end instants
 */
public static Pair<String, String> calculateBeginAndEndInstants(JavaSparkContext jssc, String srcBasePath,
    int numInstantsPerFetch, Option<String> beginInstant, boolean readLatestOnMissingBeginInstant) {
  ValidationUtils.checkArgument(numInstantsPerFetch > 0,
      "Make sure the config hoodie.deltastreamer.source.hoodieincr.num_instants is set to a positive value");
  HoodieTableMetaClient srcMetaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(), srcBasePath, true);

  final HoodieTimeline activeCommitTimeline =
      srcMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();

  String beginInstantTime = beginInstant.orElseGet(() -> {
    if (readLatestOnMissingBeginInstant) {
      Option<HoodieInstant> lastInstant = activeCommitTimeline.lastInstant();
      return lastInstant.map(hoodieInstant -> getStrictlyLowerTimestamp(hoodieInstant.getTimestamp())).orElse("000");
    } else {
      throw new IllegalArgumentException("Missing begin instant for incremental pull. For reading from latest "
          + "committed instant set hoodie.deltastreamer.source.hoodie.read_latest_on_midding_ckpt to true");
    }
  });

  Option<HoodieInstant> nthInstant = Option.fromJavaOptional(activeCommitTimeline
      .findInstantsAfter(beginInstantTime, numInstantsPerFetch).getInstants().reduce((x, y) -> y));
  return Pair.of(beginInstantTime, nthInstant.map(HoodieInstant::getTimestamp).orElse(beginInstantTime));
}

Example 3

Source File: HoodieReadClient.java From hudi with Apache License 2.0

5 votes

/**
 * @param clientConfig instance of HoodieWriteConfig
 */
public HoodieReadClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) {
  this.jsc = jsc;
  this.hadoopConf = jsc.hadoopConfiguration();
  final String basePath = clientConfig.getBasePath();
  // Create a Hoodie table which encapsulated the commits and files visible
  HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath, true);
  this.hoodieTable = HoodieTable.create(metaClient, clientConfig, hadoopConf);
  this.index = HoodieIndex.createIndex(clientConfig);
  this.sqlContextOpt = Option.empty();
}

Example 4

Source File: AbstractHoodieClient.java From hudi with Apache License 2.0

5 votes

protected AbstractHoodieClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig,
    Option<EmbeddedTimelineService> timelineServer) {
  this.hadoopConf = jsc.hadoopConfiguration();
  this.fs = FSUtils.getFs(clientConfig.getBasePath(), hadoopConf);
  this.jsc = jsc;
  this.basePath = clientConfig.getBasePath();
  this.config = clientConfig;
  this.timelineServer = timelineServer;
  shouldStopTimelineServer = !timelineServer.isPresent();
  startEmbeddedServerView();
}

Example 5

Source File: BaseActionExecutor.java From hudi with Apache License 2.0

5 votes

public BaseActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable<?> table, String instantTime) {
  this.jsc = jsc;
  this.hadoopConf = jsc.hadoopConfiguration();
  this.config = config;
  this.table = table;
  this.instantTime = instantTime;
}

Example 6

Source File: HoodieClientTestHarness.java From hudi with Apache License 2.0

5 votes

/**
 * Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) with the given application name.
 *
 * @param appName The specified application name.
 */
protected void initSparkContexts(String appName) {
  // Initialize a local spark env
  jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest(appName));
  jsc.setLogLevel("ERROR");
  hadoopConf = jsc.hadoopConfiguration();

  // SQLContext stuff
  sqlContext = new SQLContext(jsc);
}

Example 7

Source File: CsvDFSSource.java From hudi with Apache License 2.0

5 votes

public CsvDFSSource(TypedProperties props,
    JavaSparkContext sparkContext,
    SparkSession sparkSession,
    SchemaProvider schemaProvider) {
  super(props, sparkContext, sparkSession, schemaProvider);
  this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration());
  if (schemaProvider != null) {
    sourceSchema = (StructType) SchemaConverters.toSqlType(schemaProvider.getSourceSchema())
        .dataType();
  } else {
    sourceSchema = null;
  }
}

Example 8

Source File: SparkCubingByLayer.java From kylin-on-parquet-v2 with Apache License 2.0

4 votes

@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl,
            sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}

Example 9

Source File: AvroDFSSource.java From hudi with Apache License 2.0

4 votes

public AvroDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
    SchemaProvider schemaProvider) {
  super(props, sparkContext, sparkSession, schemaProvider);
  this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration());
}

Example 10

Source File: JsonDFSSource.java From hudi with Apache License 2.0

4 votes

public JsonDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
    SchemaProvider schemaProvider) {
  super(props, sparkContext, sparkSession, schemaProvider);
  this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration());
}

Example 11

Source File: HoodieDeltaStreamer.java From hudi with Apache License 2.0

4 votes

public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc) throws IOException {
  this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()),
      jssc.hadoopConfiguration(), null);
}

Example 12

Source File: HoodieDeltaStreamer.java From hudi with Apache License 2.0

4 votes

public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, TypedProperties props) throws IOException {
  this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()),
      jssc.hadoopConfiguration(), props);
}

Example 13

Source File: HoodieCompactionAdminTool.java From hudi with Apache License 2.0

4 votes

/**
 * Executes one of compaction admin operations.
 */
public void run(JavaSparkContext jsc) throws Exception {
  HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.basePath);
  try (CompactionAdminClient admin = new CompactionAdminClient(jsc, cfg.basePath)) {
    final FileSystem fs = FSUtils.getFs(cfg.basePath, jsc.hadoopConfiguration());
    if (cfg.outputPath != null && fs.exists(new Path(cfg.outputPath))) {
      throw new IllegalStateException("Output File Path already exists");
    }
    switch (cfg.operation) {
      case VALIDATE:
        List<ValidationOpResult> res =
            admin.validateCompactionPlan(metaClient, cfg.compactionInstantTime, cfg.parallelism);
        if (cfg.printOutput) {
          printOperationResult("Result of Validation Operation :", res);
        }
        serializeOperationResult(fs, res);
        break;
      case UNSCHEDULE_FILE:
        List<RenameOpResult> r = admin.unscheduleCompactionFileId(
            new HoodieFileGroupId(cfg.partitionPath, cfg.fileId), cfg.skipValidation, cfg.dryRun);
        if (cfg.printOutput) {
          System.out.println(r);
        }
        serializeOperationResult(fs, r);
        break;
      case UNSCHEDULE_PLAN:
        List<RenameOpResult> r2 = admin.unscheduleCompactionPlan(cfg.compactionInstantTime, cfg.skipValidation,
            cfg.parallelism, cfg.dryRun);
        if (cfg.printOutput) {
          printOperationResult("Result of Unscheduling Compaction Plan :", r2);
        }
        serializeOperationResult(fs, r2);
        break;
      case REPAIR:
        List<RenameOpResult> r3 = admin.repairCompaction(cfg.compactionInstantTime, cfg.parallelism, cfg.dryRun);
        if (cfg.printOutput) {
          printOperationResult("Result of Repair Operation :", r3);
        }
        serializeOperationResult(fs, r3);
        break;
      default:
        throw new IllegalStateException("Not yet implemented !!");
    }
  }
}

Example 14

Source File: SparkCubingByLayer.java From kylin with Apache License 2.0

4 votes

@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc(), AbstractHadoopJob.loadKylinConfigFromHdfs(new SerializableConfiguration(sc.hadoopConfiguration()), metaUrl)); // set dfs.replication and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl,
            sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}

Example 15

Source File: BatchLayer.java From oryx with Apache License 2.0

4 votes

public synchronized void start() {
  String id = getID();
  if (id != null) {
    log.info("Starting Batch Layer {}", id);
  }

  streamingContext = buildStreamingContext();
  JavaSparkContext sparkContext = streamingContext.sparkContext();
  Configuration hadoopConf = sparkContext.hadoopConfiguration();

  Path checkpointPath = new Path(new Path(modelDirString), ".checkpoint");
  log.info("Setting checkpoint dir to {}", checkpointPath);
  sparkContext.setCheckpointDir(checkpointPath.toString());

  log.info("Creating message stream from topic");
  JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext);
  JavaPairDStream<K,M> pairDStream =
      kafkaDStream.mapToPair(mAndM -> new Tuple2<>(mAndM.key(), mAndM.value()));

  Class<K> keyClass = getKeyClass();
  Class<M> messageClass = getMessageClass();
  pairDStream.foreachRDD(
      new BatchUpdateFunction<>(getConfig(),
                                keyClass,
                                messageClass,
                                keyWritableClass,
                                messageWritableClass,
                                dataDirString,
                                modelDirString,
                                loadUpdateInstance(),
                                streamingContext));

  // "Inline" saveAsNewAPIHadoopFiles to be able to skip saving empty RDDs
  pairDStream.foreachRDD(new SaveToHDFSFunction<>(
      dataDirString + "/oryx",
      "data",
      keyClass,
      messageClass,
      keyWritableClass,
      messageWritableClass,
      hadoopConf));

  // Must use the raw Kafka stream to get offsets
  kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster()));

  if (maxDataAgeHours != NO_MAX_AGE) {
    pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf,
                                                 dataDirString,
                                                 Pattern.compile("-(\\d+)\\."),
                                                 maxDataAgeHours));
  }
  if (maxModelAgeHours != NO_MAX_AGE) {
    pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf,
                                                 modelDirString,
                                                 Pattern.compile("(\\d+)"),
                                                 maxModelAgeHours));
  }

  log.info("Starting Spark Streaming");

  streamingContext.start();
}