Java Code Examples for org.apache.spark.SparkConf#set()

The following examples show how to use org.apache.spark.SparkConf#set(). They are taken from open-source projects; the project and source file for each example are noted above its code.
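Before the project examples, here is a minimal, self-contained sketch of the method itself. The class name, master setting, and property values below are illustrative placeholders rather than code from any of the projects listed; set(key, value) records a string property on the SparkConf and returns the same object, so calls can be chained.

import org.apache.spark.SparkConf;

public class SparkConfSetSketch {
    public static void main(String[] args) {
        // Illustrative configuration; "local[2]" and the property values are placeholders.
        SparkConf conf = new SparkConf()
                .setAppName("conf-set-sketch")
                .setMaster("local[2]");

        // set() returns the SparkConf itself, so properties can be chained.
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.driver.maxResultSize", "1g");

        // get(key) reads a value back; get(key, defaultValue) supplies a fallback.
        System.out.println(conf.get("spark.serializer"));
        System.out.println(conf.get("spark.ui.port", "4040"));
    }
}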
Example 1
Source File: StreamingContextConfiguration.java    From Decision with Apache License 2.0
private JavaStreamingContext create(String streamingContextName, int port, long streamingBatchTime, String sparkHost) {
    SparkConf conf = new SparkConf();
    conf.set("spark.ui.port", String.valueOf(port));
    conf.setAppName(streamingContextName);
    conf.setJars(JavaStreamingContext.jarOfClass(StreamingEngine.class));
    conf.setMaster(sparkHost);

    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.registerKryoClasses(new Class[] { StratioStreamingMessage.class, InsertMessage.class, ColumnType.class,
            Action.class});


    HashMap<String, String> tuningProperties = configurationContext.getSparkTunningProperties();
    if (tuningProperties != null && tuningProperties.size() > 0) {
        tuningProperties.forEach( (key, value) ->  conf.set(key, value));
    }

    JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(streamingBatchTime));

    return streamingContext;
}
 
Example 2
Source File: ChronixSparkLoader.java    From chronix.spark with Apache License 2.0
public ChronixSparkContext createChronixSparkContext() throws IOException {
    if (chronixSparkContext != null) {
        return chronixSparkContext;
    }

    SparkConf sparkConf = new SparkConf()
            .setMaster(chronixYAMLConfiguration.getSparkMaster())
            .setAppName(chronixYAMLConfiguration.getAppName());

    ChronixSparkContext.tuneSparkConf(sparkConf);

    //Set spark values given in yaml config
    for (Map.Entry<String, String> setting : chronixYAMLConfiguration.getSparkSettings().entrySet()) {
        sparkConf.set(setting.getKey(), setting.getValue());
    }

    if (chronixYAMLConfiguration.isDistributed()) {
        sparkConf.setJars(chronixYAMLConfiguration.getJars());
    }

    chronixSparkContext = new ChronixSparkContext(new JavaSparkContext(sparkConf));
    return chronixSparkContext;
}
 
Example 3
Source File: SparkOnYarnContainer.java    From liteflow with Apache License 2.0
private SparkConf initSparkConf(JSONObject configObj){

        String jobName = configObj.getString(CommonConstants.PARAM_EXECUTOR_JOB_NAME);

        String yarnQueue = configObj.getString(CommonConstants.SPARK_PARAM_YARN_QUEUE);
        String instanceNum = configObj.getString(CommonConstants.SPARK_PARAM_INSTANCE_NUM);

        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName(jobName);

        sparkConf.set("spark.app.name", jobName);
        sparkConf.set("spark.yarn.queue", yarnQueue);


        sparkConf.set("spark.driver.cores", configObj.getString(CommonConstants.SPARK_PARAM_DRIVER_CORES));
        sparkConf.set("spark.driver.memory", configObj.getString(CommonConstants.SPARK_PARAM_DRIVER_MEMORY) + CommonConstants.SPARK_PARAM_MEMORY_UNIT);
        sparkConf.set("spark.executor.cores", configObj.getString(CommonConstants.SPARK_PARAM_EXECUTOR_CORES));
        sparkConf.set("spark.executor.memory", configObj.getString(CommonConstants.SPARK_PARAM_EXECUTOR_MEMORY) + CommonConstants.SPARK_PARAM_MEMORY_UNIT);
        // Set the number of executor instances (dynamic allocation if enabled, otherwise a fixed count)
        Boolean isDynamicAllocation = HadoopConfig.getHadoopConf().getIsDynamicAllocation();
        if (isDynamicAllocation != null && isDynamicAllocation) {
            sparkConf.set("spark.shuffle.service.enabled", "true");
            sparkConf.set("spark.dynamicAllocation.enabled", "true");
            sparkConf.set("spark.dynamicAllocation.minExecutors", "1");
            sparkConf.set("spark.dynamicAllocation.maxExecutors", String.valueOf(instanceNum));
        } else {
            sparkConf.set("spark.executor.instances", String.valueOf(instanceNum));
        }

        /**
         * Hadoop and Hive configuration files
         */
        String hadoopFiles = HadoopConfig.getHadoopConf().getSparkYarnDistFiles();
        sparkConf.set("spark.yarn.dist.files", hadoopFiles + CommonConstants.COMMA + configObj.getString(Constants.JOB_CONFIG_PATH));

        return sparkConf;

    }
 
Example 4
Source File: UtilHelpers.java    From hudi with Apache License 2.0
private static SparkConf buildSparkConf(String appName, String defaultMaster, Map<String, String> additionalConfigs) {
  final SparkConf sparkConf = new SparkConf().setAppName(appName);
  String master = sparkConf.get("spark.master", defaultMaster);
  sparkConf.setMaster(master);
  if (master.startsWith("yarn")) {
    sparkConf.set("spark.eventLog.overwrite", "true");
    sparkConf.set("spark.eventLog.enabled", "true");
  }
  sparkConf.setIfMissing("spark.driver.maxResultSize", "2g");
  sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  sparkConf.set("spark.hadoop.mapred.output.compress", "true");
  sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
  sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
  sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");

  additionalConfigs.forEach(sparkConf::set);
  return HoodieWriteClient.registerClasses(sparkConf);
}
 
Example 5
Source File: SparkUtil.java    From hudi with Apache License 2.0
public static JavaSparkContext initJavaSparkConf(String name, Option<String> master,
    Option<String> executorMemory) {
  SparkConf sparkConf = new SparkConf().setAppName(name);

  String defMaster = master.orElse(sparkConf.getenv(HoodieCliSparkConfig.CLI_SPARK_MASTER));
  if ((null == defMaster) || (defMaster.isEmpty())) {
    sparkConf.setMaster(DEFAULT_SPARK_MASTER);
  } else {
    sparkConf.setMaster(defMaster);
  }

  sparkConf.set(HoodieCliSparkConfig.CLI_SERIALIZER, "org.apache.spark.serializer.KryoSerializer");
  sparkConf.set(HoodieCliSparkConfig.CLI_DRIVER_MAX_RESULT_SIZE, "2g");
  sparkConf.set(HoodieCliSparkConfig.CLI_EVENT_LOG_OVERWRITE, "true");
  sparkConf.set(HoodieCliSparkConfig.CLI_EVENT_LOG_ENABLED, "true");
  if (executorMemory.isPresent()) {
    sparkConf.set(HoodieCliSparkConfig.CLI_EXECUTOR_MEMORY, executorMemory.get());
  }

  // Configure hadoop conf
  sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESS, "true");
  sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESSION_CODEC, "true");
  sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESSION_CODEC, "org.apache.hadoop.io.compress.GzipCodec");
  sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESSION_TYPE, "BLOCK");

  HoodieWriteClient.registerClasses(sparkConf);
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  jsc.hadoopConfiguration().setBoolean(HoodieCliSparkConfig.CLI_PARQUET_ENABLE_SUMMARY_METADATA, false);
  FSUtils.prepareHadoopConf(jsc.hadoopConfiguration());
  return jsc;
}
 
Example 6
Source File: SqoopSparkJob.java    From sqoop-on-spark with Apache License 2.0
public SparkConf init(CommandLine cArgs) throws ClassNotFoundException {
  System.setProperty(ConfigurationConstants.SYSPROP_CONFIG_DIR, cArgs.getOptionValue("confDir"));
  // by default it is local, override based on the submit parameter
  SparkConf conf = new SparkConf().setAppName("sqoop-spark").setMaster("local");
  if (cArgs.getOptionValue("defaultExtractors") != null) {
    conf.set(SqoopSparkDriver.DEFAULT_EXTRACTORS, cArgs.getOptionValue("defaultExtractors"));
  }
  if (cArgs.getOptionValue("numL") != null) {
    conf.set(SqoopSparkDriver.NUM_LOADERS, cArgs.getOptionValue("numL"));
  }
  // hack to load extra classes directly
  Class.forName("com.mysql.jdbc.Driver");
  SqoopServer.initialize();
  return conf;
}
 
Example 7
Source File: JavaHBaseStreamingBulkPutExample.java    From learning-hadoop with Apache License 2.0
public static void main(String args[]) {
  if (args.length < 5) {
    System.out
        .println("JavaHBaseBulkPutExample  {master} {host} {port} {tableName} {columnFamily}");
    return;
  }

  String master = args[0];
  String host = args[1];
  String port = args[2];
  String tableName = args[3];
  String columnFamily = args[4];

  System.out.println("master:" + master);
  System.out.println("host:" + host);
  System.out.println("port:" + Integer.parseInt(port));
  System.out.println("tableName:" + tableName);
  System.out.println("columnFamily:" + columnFamily);
  
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.cleaner.ttl", "120000");
  
  JavaSparkContext jsc = new JavaSparkContext(master,
      "JavaHBaseBulkPutExample");
  jsc.addJar("SparkHBase.jar");
  
  JavaStreamingContext jssc = new JavaStreamingContext(jsc, new Duration(1000));

  JavaReceiverInputDStream<String> javaDstream = jssc.socketTextStream(host, Integer.parseInt(port));
  
  Configuration conf = HBaseConfiguration.create();
  conf.addResource(new Path("/etc/hbase/conf/core-site.xml"));
  conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));

  JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);

  hbaseContext.streamBulkPut(javaDstream, tableName, new PutFunction(), true);
}
 
Example 8
Source File: GeoWaveSparkConf.java    From geowave with Apache License 2.0
public static SparkSession createSessionFromParams(
    final String appName,
    String master,
    final String host,
    final String jars) {
  // Grab default config for GeoWave
  SparkConf defaultConfig = GeoWaveSparkConf.getDefaultConfig();
  // Apply master from default
  if (master == null) {
    master = "yarn";
  }

  // Apply user options if set, correctly handling host for yarn.
  if (appName != null) {
    defaultConfig = defaultConfig.setAppName(appName);
  }
  defaultConfig = defaultConfig.setMaster(master);
  if (host != null) {
    if (master != "yarn") {
      defaultConfig = defaultConfig.set("spark.driver.host", host);
    } else {
      LOGGER.warn(
          "Attempting to set spark driver host for yarn master. Normally this is handled via hadoop configuration. Remove host or set another master designation and try again.");
    }
  }

  if (jars != null) {
    defaultConfig = defaultConfig.set("spark.jars", jars);
  }

  // Finally return the session from builder
  return GeoWaveSparkConf.internalCreateSession(defaultConfig, null);
}
 
Example 9
Source File: JavaEmbeddedIgniteRDDWithLocalStoreSelfTest.java    From ignite with Apache License 2.0
/**
 * Creates a default Spark context.
 *
 * @return Context.
 */
private JavaSparkContext createContext() {
    SparkConf conf = new SparkConf();

    conf.set("spark.executor.instances", String.valueOf(GRID_CNT));

    return new JavaSparkContext("local[" + GRID_CNT + "]", "test", conf);
}
 
Example 10
Source File: SqoopSparkClientFactory.java    From sqoop-on-spark with Apache License 2.0
static SparkConf generateSparkConf(Map<String, String> conf) {
  SparkConf sparkConf = new SparkConf(false);
  for (Map.Entry<String, String> entry : conf.entrySet()) {
    sparkConf.set(entry.getKey(), entry.getValue());
  }
  return sparkConf;
}
 
Example 11
Source File: SpliceSparkWatcher.java    From spliceengine with GNU Affero General Public License v3.0
@Override
protected void starting(Description description) {
	super.starting(description);
	SpliceLogUtils.trace(LOG, "starting spark");
   	SparkConf sparkConf = new SparkConf().setAppName(appName).setMaster("local");
   	sparkConf.set("spark.broadcast.compress", "false"); // Will attempt to use Snappy without this set.
   	sparkConf.set("spark.driver.allowMultipleContexts", "true"); // SPARK-2243
   	jsc = new JavaSparkContext(sparkConf);
}
 
Example 12
Source File: SparkEngineBase.java    From beakerx with Apache License 2.0
protected void configureSparkConf(SparkConf sparkConf) {
  if (!sparkConf.contains(SPARK_APP_NAME)) {
    sparkConf.setAppName("beaker_" + UUID.randomUUID().toString());
  }
  if (sparkConf.contains(SPARK_MASTER) && !isLocalSpark(sparkConf)) {
    sparkConf.set(SPARK_REPL_CLASS_OUTPUT_DIR, KernelManager.get().getOutDir());
  }
}
 
Example 13
Source File: SparkIntegrationTestResource.java    From components with Apache License 2.0
/**
 * @return a clean spark configuration created from the options in this resource.
 */
public SparkConf createSparkConf(String appName) {
    SparkConf conf = new SparkConf();
    conf.setAppName(appName);
    conf.setMaster(sm);
    // conf.set("spark.driver.host", "10.42.30.148");
    for (Map.Entry<String, String> kv : hadoopConf.entrySet())
        conf.set("spark.hadoop." + kv.getKey(), kv.getValue());
    return conf;
}
 
Example 14
Source File: SparkRunnerKryoRegistratorTest.java    From beam with Apache License 2.0
@Test
public void testDefaultSerializerNotCallingKryo() {
  SparkConf conf = new SparkConf();
  conf.set("spark.kryo.registrator", KryoRegistratorIsNotCalled.class.getName());
  runSimplePipelineWithSparkContext(conf);
}
 
Example 15
Source File: ComputeResponse.java    From incubator-retired-pirk with Apache License 2.0
public ComputeResponse(FileSystem fileSys) throws PIRException
{
  fs = fileSys;
  storage = new HadoopFileSystemStore(fs);

  dataInputFormat = SystemConfiguration.getProperty("pir.dataInputFormat");
  if (!InputFormatConst.ALLOWED_FORMATS.contains(dataInputFormat))
  {
    throw new IllegalArgumentException("inputFormat = " + dataInputFormat + " is of an unknown form");
  }
  logger.info("inputFormat = " + dataInputFormat);
  if (dataInputFormat.equals(InputFormatConst.BASE_FORMAT))
  {
    inputData = SystemConfiguration.getProperty("pir.inputData", "none");
    if (inputData.equals("none"))
    {
      throw new IllegalArgumentException("For inputFormat = " + dataInputFormat + " an inputFile must be specified");
    }
    logger.info("inputFile = " + inputData);
  }
  else if (dataInputFormat.equals(InputFormatConst.ES))
  {
    esQuery = SystemConfiguration.getProperty("pir.esQuery", "none");
    esResource = SystemConfiguration.getProperty("pir.esResource", "none");
    if (esQuery.equals("none"))
    {
      throw new IllegalArgumentException("esQuery must be specified");
    }
    if (esResource.equals("none"))
    {
      throw new IllegalArgumentException("esResource must be specified");
    }
    logger.info("esQuery = " + esQuery + " esResource = " + esResource);
  }
  outputFile = SystemConfiguration.getProperty("pir.outputFile");
  outputDirExp = outputFile + "_exp";

  queryInput = SystemConfiguration.getProperty("pir.queryInput");
  String stopListFile = SystemConfiguration.getProperty("pir.stopListFile");
  useModExpJoin = SystemConfiguration.getBooleanProperty("pir.useModExpJoin", false);

  logger.info("outputFile = " + outputFile + " queryInputDir = " + queryInput + " stopListFile = " + stopListFile + " esQuery = " + esQuery + " esResource = "
      + esResource);

  // Set the necessary configurations
  SparkConf conf = new SparkConf().setAppName("SparkPIR").setMaster("yarn-cluster");
  conf.set("es.nodes", SystemConfiguration.getProperty("es.nodes", "none"));
  conf.set("es.port", SystemConfiguration.getProperty("es.port", "none"));
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  conf.set("spark.memory.storageFraction", "0.10");
  conf.set("spark.memory.fraction", "0.25");
  // conf.set("spark.memory.fraction", "0.25");
  // conf.set("spark.executor.extraJavaOptions", "-XX:+UseCompressedOops");
  sc = new JavaSparkContext(conf);

  // Setup, run query, teardown
  logger.info("Setting up for query run");
  try
  {
    setup();
  } catch (IOException e)
  {
    throw new PIRException("An error occurred setting up the Spark responder.", e);
  }
  logger.info("Setup complete");
}
 
Example 16
Source File: WordCountingAppWithCheckpoint.java    From tutorials with MIT License
public static void main(String[] args) throws InterruptedException {

        Logger.getLogger("org")
            .setLevel(Level.OFF);
        Logger.getLogger("akka")
            .setLevel(Level.OFF);

        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Collection<String> topics = Arrays.asList("messages");

        SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster("local[2]");
        sparkConf.setAppName("WordCountingAppWithCheckpoint");
        sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");

        JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        sparkContext = streamingContext.sparkContext();

        streamingContext.checkpoint("./.checkpoint");

        JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));

        JavaPairDStream<String, String> results = messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

        JavaDStream<String> lines = results.map(tuple2 -> tuple2._2());

        JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split("\\s+"))
            .iterator());

        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((Function2<Integer, Integer, Integer>) (i1, i2) -> i1 + i2);

        JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> cumulativeWordCounts = wordCounts.mapWithState(StateSpec.function((word, one, state) -> {
            int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
            Tuple2<String, Integer> output = new Tuple2<>(word, sum);
            state.update(sum);
            return output;
        }));

        cumulativeWordCounts.foreachRDD(javaRdd -> {
            List<Tuple2<String, Integer>> wordCountList = javaRdd.collect();
            for (Tuple2<String, Integer> tuple : wordCountList) {
                List<Word> wordList = Arrays.asList(new Word(tuple._1, tuple._2));
                JavaRDD<Word> rdd = sparkContext.parallelize(wordList);
                javaFunctions(rdd).writerBuilder("vocabulary", "words", mapToRow(Word.class))
                    .saveToCassandra();
            }
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }
 
Example 17
Source File: SparkMergingDictionary.java    From kylin with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String segmentIds = optionsHelper.getOptionValue(OPTION_MERGE_SEGMENT_IDS);
    final String dictOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_DICT);
    final String statOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_STAT);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("scala.collection.mutable.WrappedArray$ofRef") };

    SparkConf conf = new SparkConf().setAppName("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        KylinSparkJobListener jobListener = new KylinSparkJobListener();
        sc.sc().addSparkListener(jobListener);

        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(dictOutputPath));

        final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
        final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
        final CubeDesc cubeDesc = CubeDescManager.getInstance(envConfig).getCubeDesc(cubeInstance.getDescName());

        logger.info("Dictionary output path: {}", dictOutputPath);
        logger.info("Statistics output path: {}", statOutputPath);

        final TblColRef[] tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
        final int columnLength = tblColRefs.length;

        List<Integer> indexs = Lists.newArrayListWithCapacity(columnLength);

        for (int i = 0; i <= columnLength; i++) {
            indexs.add(i);
        }

        JavaRDD<Integer> indexRDD = sc.parallelize(indexs, columnLength + 1);

        JavaPairRDD<Text, Text> colToDictPathRDD = indexRDD.mapToPair(new MergeDictAndStatsFunction(cubeName,
                metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));

        colToDictPathRDD.coalesce(1, false).saveAsNewAPIHadoopFile(dictOutputPath, Text.class, Text.class,
                SequenceFileOutputFormat.class);
    }
}
 
Example 18
Source File: JavaStocks.java    From spark-ts-examples with Apache License 2.0
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("Spark-TS Ticker Example").setMaster("local");
  conf.set("spark.io.compression.codec", "org.apache.spark.io.LZ4CompressionCodec");
  JavaSparkContext context = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(context);

  DataFrame tickerObs = loadObservations(context, sqlContext, "../data/ticker.tsv");

  // Create a daily DateTimeIndex over August and September 2015
  ZoneId zone = ZoneId.systemDefault();
  DateTimeIndex dtIndex = DateTimeIndexFactory.uniformFromInterval(
      ZonedDateTime.of(LocalDateTime.parse("2015-08-03T00:00:00"), zone),
      ZonedDateTime.of(LocalDateTime.parse("2015-09-22T00:00:00"), zone),
      new BusinessDayFrequency(1, 0));

  // Align the ticker data on the DateTimeIndex to create a TimeSeriesRDD
  JavaTimeSeriesRDD tickerTsrdd = JavaTimeSeriesRDDFactory.timeSeriesRDDFromObservations(
      dtIndex, tickerObs, "timestamp", "symbol", "price");

  // Cache it in memory
  tickerTsrdd.cache();

  // Count the number of series (number of symbols)
  System.out.println(tickerTsrdd.count());

  // Impute missing values using linear interpolation
  JavaTimeSeriesRDD<String> filled = tickerTsrdd.fill("linear");

  // Compute return rates
  JavaTimeSeriesRDD<String> returnRates = filled.returnRates();

  // Compute Durbin-Watson stats for each series
  JavaPairRDD<String, Double> dwStats = returnRates.mapValues(
      (Vector x) -> TimeSeriesStatisticalTests.dwtest(x)
  );

  class StatsComparator implements Comparator<Tuple2<String,Double>>, java.io.Serializable {
      public int compare(Tuple2<String, Double> a, Tuple2<String, Double> b) {
          return a._2.compareTo(b._2);
      }
  }

  System.out.println(dwStats.min(new StatsComparator()));
  System.out.println(dwStats.max(new StatsComparator()));
}
 
Example 19
Source File: SparkUHCDictionary.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
 
Example 20
Source File: SparkCubingByLayer.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount()) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl,
            sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (!allNormalMeasure) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled()) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}