Java Code Examples for org.apache.spark.api.java.JavaPairRDD#count()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#count(). Each example is taken from an open-source project; the source file, project, and license are noted in the header above the code.
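
Before the project examples, here is a minimal, stand-alone sketch of what JavaPairRDD#count() does: it is an action that triggers a Spark job and returns the number of key-value pairs in the RDD as a long. The class and variable names below are made up for illustration and do not come from any of the projects listed on this page.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class JavaPairRddCountExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JavaPairRddCountExample").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            // Build a small pair RDD of (word, length) tuples.
            JavaPairRDD<String, Integer> pairs = jsc.parallelize(Arrays.asList("spark", "java", "rdd"))
                    .mapToPair(s -> new Tuple2<>(s, s.length()));

            // count() is an action: it runs a job and returns the number of pairs, here 3.
            long numPairs = pairs.count();
            System.out.println("Number of pairs: " + numPairs);
        }
    }
}
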
Example 1
Source File: SparkTableChecker.java    From spliceengine with GNU Affero General Public License v3.0
/**
 * Check for invalid index entries that do not refer to a base table row.
 * @param table the base table data set
 * @param index the index data set
 * @return messages describing any invalid index entries found
 * @throws StandardException
 */
private List<String> checkInvalidIndexes(PairDataSet table, PairDataSet index) throws StandardException {
    List<String> messages = Lists.newLinkedList();
    SpliceSpark.pushScope(String.format("Check invalidate index from %s.%s", schemaName, indexName));
    PairDataSet<String, Tuple2<byte[], ExecRow>> d1 = index.subtractByKey(table, null);
    SpliceSpark.popScope();
    JavaPairRDD rdd = ((SparkPairDataSet) d1).rdd;
    invalidIndexCount = rdd.count();
    if (invalidIndexCount > 0) {
        if (fix) {
            return fixInvalidIndexes(rdd);
        } else {
            return reportInvalidIndexes(rdd);
        }
    }
    return messages;
}
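
Examples 1 and 7 follow the same consistency-check pattern, just in opposite directions: subtractByKey() keeps the entries whose keys are missing from the other data set, and count() measures how many such orphans exist. Below is a simplified sketch of that pattern in plain JavaPairRDD terms; the class name, method name, and value types are illustrative assumptions, not Splice Machine APIs.

import org.apache.spark.api.java.JavaPairRDD;

public class IndexConsistencyCheckSketch {
    /** Counts index entries whose key has no matching base-table row. */
    public static long countInvalidIndexEntries(JavaPairRDD<String, byte[]> indexRdd,
                                                JavaPairRDD<String, byte[]> tableRdd) {
        // subtractByKey keeps the pairs from indexRdd whose keys do not appear in tableRdd.
        JavaPairRDD<String, byte[]> orphans = indexRdd.subtractByKey(tableRdd);
        // count() is the action that actually runs the comparison job.
        return orphans.count();
    }
}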
 
Example 2
Source File: SparkExecutionContext.java    From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
public void cacheMatrixObject( String var ) {
	//get input rdd and default storage level
	MatrixObject mo = getMatrixObject(var);

	//double check size to avoid unnecessary spark context creation
	if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(),
			OptimizerUtils.estimateSizeExactSparsity(mo.getDataCharacteristics())) )
		return;

	JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
			getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);

	//persist rdd (force rdd caching, if not already cached)
	if( !isRDDCached(in.id()) )
		in.count(); //trigger caching to prevent contention
}
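
Examples 2 and 3 (and Examples 14, 15, and 19 further down) call count() purely for its side effect: persist()/cache() is lazy, so an action is needed before the RDD is actually materialized in the chosen storage level. A minimal sketch of that idiom, stripped of the SystemDS specifics (the class and method names here are illustrative):

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.storage.StorageLevel;

public class ForceCachingSketch {
    /** Pins a pair RDD into the requested storage level by running a cheap action. */
    public static <K, V> JavaPairRDD<K, V> materialize(JavaPairRDD<K, V> rdd, StorageLevel level) {
        JavaPairRDD<K, V> persisted = rdd.persist(level);
        // persist() only marks the RDD for caching; count() forces evaluation so the blocks are cached now.
        persisted.count();
        return persisted;
    }
}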
 
Example 3
Source File: SparkExecutionContext.java    From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
public void cacheMatrixObject( String var ) {
	//get input rdd and default storage level
	MatrixObject mo = getMatrixObject(var);

	//double check size to avoid unnecessary spark context creation
	if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(),
			OptimizerUtils.estimateSizeExactSparsity(mo.getDataCharacteristics())) )
		return;

	JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		getRDDHandleForMatrixObject(mo, FileFormat.BINARY);

	//persist rdd (force rdd caching, if not already cached)
	if( !isRDDCached(in.id()) )
		in.count(); //trigger caching to prevent contention
}
 
Example 4
Source File: StructureToProteinDimersTest.java    From mmtf-spark with Apache License 2.0
@Test
public void test4() throws IOException {
    List<String> pdbIds = Arrays.asList("1BZ5");//D5
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache(); 
    /*
     * C5-B4
     * C6-B3
     * D7-A2
     * D8-A1
     * E10-E9
     */
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(9, 10, false, true));
    long count = pdb.count();
    assertEquals(5, count);
}
 
Example 5
Source File: StructureToProteinDimersTest.java    From mmtf-spark with Apache License 2.0
@Test
public void test3() throws IOException {
    List<String> pdbIds = Arrays.asList("4GIS");//D4
    /*
     * A3-A2
     * A4-A1
     * B5-A1
     * B6-A2
     * B6-B5
     * B7-A3
     * B7-A4
     * B8-A4
     * B8-B7
     */
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(8, 20, false, true));
    long count = pdb.count();
    assertEquals(9, count);
}
 
Example 6
Source File: JavaTC.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaTC")
    .getOrCreate();

  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  Integer slices = (args.length > 0) ? Integer.parseInt(args[0]) : 2;
  JavaPairRDD<Integer, Integer> tc = jsc.parallelizePairs(generateGraph(), slices).cache();

  // Linear transitive closure: each round grows paths by one edge,
  // by joining the graph's edges with the already-discovered paths.
  // e.g. join the path (y, z) from the TC with the edge (x, y) from
  // the graph to obtain the path (x, z).

  // Because join() joins on keys, the edges are stored in reversed order.
  JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
    new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
      @Override
      public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
        return new Tuple2<>(e._2(), e._1());
      }
  });

  long oldCount;
  long nextCount = tc.count();
  do {
    oldCount = nextCount;
    // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
    // then project the result to obtain the new (x, z) paths.
    tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
    nextCount = tc.count();
  } while (nextCount != oldCount);

  System.out.println("TC has " + tc.count() + " edges.");
  spark.stop();
}
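
Here count() drives the fixed-point loop: each round unions the newly joined paths into the transitive closure, re-counts, and stops as soon as the count no longer grows, meaning a full pass discovered no new edges. Because count() is an action, it also forces each round's freshly cached RDD to be computed before the next iteration.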
 
Example 7
Source File: SparkTableChecker.java    From spliceengine with GNU Affero General Public License v3.0
/**
 * Check whether every row in the base table is indexed.
 * @param table the base table data set
 * @param index the index data set
 * @return messages describing any unindexed rows found
 * @throws StandardException
 */
private List<String> checkMissingIndexes(PairDataSet table, PairDataSet index) throws StandardException {
    List<String> messages = Lists.newLinkedList();
    SpliceSpark.pushScope(String.format("Check unindexed rows from table %s.%s", schemaName, tableName));
    PairDataSet<String, ExecRow> d2 = table.subtractByKey(index, null);

    JavaPairRDD rdd = ((SparkPairDataSet)d2).rdd;
    missingIndexCount = rdd.count();
    if (missingIndexCount > 0) {
        messages = reportMissingIndexes(rdd, fix);
    }

    return messages;
}
 
Example 8
Source File: HoodieBloomIndex.java    From hudi with Apache License 2.0
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
                                            HoodieTable<T> hoodieTable) {

  // Step 0: cache the input record RDD
  if (config.getBloomIndexUseCaching()) {
    recordRDD.persist(SparkConfigUtils.getBloomIndexInputStorageLevel(config.getProps()));
  }

  // Step 1: Extract a thinner JavaPairRDD of (partitionPath, recordKey) pairs
  JavaPairRDD<String, String> partitionRecordKeyPairRDD =
      recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));

  // Look up the index for all partition/recordKey pairs
  JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
      lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);

  // Cache the result, for subsequent stages.
  if (config.getBloomIndexUseCaching()) {
    keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
  }
  if (LOG.isDebugEnabled()) {
    long totalTaggedRecords = keyFilenamePairRDD.count();
    LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
  }

  // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
  // Cost: 4 sec.
  JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);

  if (config.getBloomIndexUseCaching()) {
    recordRDD.unpersist(); // unpersist the input Record RDD
    keyFilenamePairRDD.unpersist();
  }
  return taggedRecordRDD;
}
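
Note that the only count() in this example is guarded by LOG.isDebugEnabled(): counting keyFilenamePairRDD launches an extra Spark job over the cached lookup result, so that cost is paid only when debug logging is turned on.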
 
Example 9
Source File: StructureToProteinDimersTest.java    From mmtf-spark with Apache License 2.0
@Test
public void test2() throws IOException {
    List<String> pdbIds = Arrays.asList("5NV3"); // D4
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(8, 20, false, true));
    long count = pdb.count();
    assertEquals(12, count);
}
 
Example 10
Source File: StructureToProteinDimersTest.java    From mmtf-spark with Apache License 2.0
@Test
public void test1() throws IOException {
    List<String> pdbIds = Arrays.asList("1I1G"); // D4
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    pdb = pdb.flatMapToPair(new StructureToBioassembly()).flatMapToPair(new StructureToProteinDimers(8, 20, false, true));
    long count = pdb.count();
    assertEquals(4, count);
}
 
Example 11
Source File: SparkCubingByLayer.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl,
            sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
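
In this job count() appears only behind isSparkSanityCheckEnabled(): counting encodedBaseRDD adds a full pass over the encoded flat table, and the resulting total is then compared against every cuboid level in sanityCheck(), so the extra job is skipped unless the check is explicitly enabled.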
 
Example 12
Source File: SparkCubingByLayer.java    From kylin with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc(), AbstractHadoopJob.loadKylinConfigFromHdfs(new SerializableConfiguration(sc.hadoopConfiguration()), metaUrl)); // set dfs.replication and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl,
            sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
 
Example 13
Source File: SparkColumnCardinality.java    From kylin with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String tableName = optionsHelper.getOptionValue(OPTION_TABLE_NAME);
    String output = optionsHelper.getOptionValue(OPTION_OUTPUT);
    int columnCnt = Integer.valueOf(optionsHelper.getOptionValue(OPTION_COLUMN_COUNT));

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Calculate table:" + tableName);
    // set spark.sql.catalogImplementation=hive; if it is not set, SparkSession cannot read Hive metadata and throws org.apache.spark.sql.AnalysisException
    conf.set("spark.sql.catalogImplementation", "hive");
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(output));
        // the table will be loaded by Spark SQL, so isSequenceFile is set to false
        final JavaRDD<String[]> recordRDD = SparkUtil.hiveRecordInputRDD(false, sc, null, tableName);
        JavaPairRDD<Integer, Long> resultRdd = recordRDD.mapPartitionsToPair(new BuildHllCounter())
                .reduceByKey((x, y) -> {
                    x.merge(y);
                    return x;
                })
                .mapToPair(record -> {
                    return new Tuple2<>(record._1, record._2.getCountEstimate());
                })
                .sortByKey(true, 1)
                .cache();

        if (resultRdd.count() == 0) {
            ArrayList<Tuple2<Integer, Long>> list = new ArrayList<>();
            for (int i = 0; i < columnCnt; ++i) {
                list.add(new Tuple2<>(i, 0L));
            }
            JavaPairRDD<Integer, Long> nullRdd = sc.parallelizePairs(list).repartition(1);
            nullRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        } else {
            resultRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        }
    }
}
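
Here count() doubles as an emptiness check on the cached result: if the source table produced no rows at all, a placeholder RDD with a zero cardinality for every column is written instead of the empty real result.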
 
Example 14
Source File: PMapmmSPInstruction.java    From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//get inputs
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	
	// This avoids errors such as java.lang.UnsupportedOperationException: Cannot change storage level of an RDD after it was already assigned a level
	// Ideally, we should ensure that we do not redundantly call persist on the same RDD.
	StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();
	
	//cache right hand side because accessed many times
	in2 = in2.repartition(sec.getSparkContext().defaultParallelism())
			 .persist(pmapmmStorageLevel);
	
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
	for( int i=0; i<mc1.getRows(); i+=NUM_ROWBLOCKS*mc1.getBlocksize() ) 
	{
		//create broadcast for rdd partition
		JavaPairRDD<MatrixIndexes,MatrixBlock> rdd = in1
				.filter(new IsBlockInRange(i+1, i+NUM_ROWBLOCKS*mc1.getBlocksize(), 1, mc1.getCols(), mc1))
				.mapToPair(new PMapMMRebaseBlocksFunction(i/mc1.getBlocksize()));
		
		int rlen = (int)Math.min(mc1.getRows()-i, NUM_ROWBLOCKS*mc1.getBlocksize());
		PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(rdd, rlen, (int)mc1.getCols(), mc1.getBlocksize(), -1L);
		Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);
		
		//matrix multiplication
		JavaPairRDD<MatrixIndexes,MatrixBlock> rdd2 = in2
				.flatMapToPair(new PMapMMFunction(bpmb, i/mc1.getBlocksize()));
		rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
		rdd2.persist(pmapmmStorageLevel)
		    .count();
		bpmb.unpersist(false);
		
		if( out == null )
			out = rdd2;
		else
			out = out.union(rdd2);
	}
	
	//cache final result
	out = out.persist(pmapmmStorageLevel);
	out.count();
	
	//put output RDD handle into symbol table
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
		
	//update output statistics if not inferred
	updateBinaryMMOutputDataCharacteristics(sec, true);
}
 
Example 15
Source File: PMapmmSPInstruction.java    From systemds with Apache License 2.0 4 votes vote down vote up
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//get inputs
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
	DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
	
	// This avoids errors such as java.lang.UnsupportedOperationException: Cannot change storage level of an RDD after it was already assigned a level
	// Ideally, we should ensure that we do not redundantly call persist on the same RDD.
	StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();
	
	//cache right hand side because accessed many times
	in2 = in2.repartition(sec.getSparkContext().defaultParallelism())
			 .persist(pmapmmStorageLevel);
	
	JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
	for( int i=0; i<mc1.getRows(); i+=NUM_ROWBLOCKS*mc1.getBlocksize() ) 
	{
		//create broadcast for rdd partition
		JavaPairRDD<MatrixIndexes,MatrixBlock> rdd = in1
				.filter(new IsBlockInRange(i+1, i+NUM_ROWBLOCKS*mc1.getBlocksize(), 1, mc1.getCols(), mc1))
				.mapToPair(new PMapMMRebaseBlocksFunction(i/mc1.getBlocksize()));
		
		int rlen = (int)Math.min(mc1.getRows()-i, NUM_ROWBLOCKS*mc1.getBlocksize());
		PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(rdd, rlen, (int)mc1.getCols(), mc1.getBlocksize(), -1L);
		Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);
		
		//matrix multiplication
		JavaPairRDD<MatrixIndexes,MatrixBlock> rdd2 = in2
				.flatMapToPair(new PMapMMFunction(bpmb, i/mc1.getBlocksize()));
		rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
		rdd2.persist(pmapmmStorageLevel)
		    .count();
		bpmb.unpersist(false);
		
		if( out == null )
			out = rdd2;
		else
			out = out.union(rdd2);
	}
	
	//cache final result
	out = out.persist(pmapmmStorageLevel);
	out.count();
	
	//put output RDD handle into symbol table
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
		
	//update output statistics if not inferred
	updateBinaryMMOutputDataCharacteristics(sec, true);
}
 
Example 16
Source File: GeoWaveSparkSQLIT.java    From geowave with Apache License 2.0
@Test
public void testCreateDataFrame() throws Exception {
  // Set up Spark
  final SparkSession session = SparkTestEnvironment.getInstance().getDefaultSession();
  final SparkContext context = session.sparkContext();

  // ingest test points
  TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

  final SqlQueryRunner queryRunner = new SqlQueryRunner();
  queryRunner.setSparkSession(session);

  try {
    // Load RDD from datastore, no filters
    final GeoWaveRDD newRDD = GeoWaveRDDLoader.loadRDD(context, dataStore, new RDDOptions());
    final JavaPairRDD<GeoWaveInputKey, SimpleFeature> javaRdd = newRDD.getRawRDD();

    final long count = javaRdd.count();
    LOGGER.warn("DataStore loaded into RDD with " + count + " features.");

    queryRunner.addInputStore(dataStore, null, "features");

    final String bbox = "POLYGON ((-94 34, -93 34, -93 35, -94 35, -94 34))";

    queryRunner.setSql(
        "SELECT * FROM features WHERE GeomContains(GeomFromWKT('" + bbox + "'), geom)");

    Dataset<Row> results = queryRunner.run();
    final long containsCount = results.count();
    LOGGER.warn("Got " + containsCount + " for GeomContains test");

    queryRunner.setSql(
        "SELECT * FROM features WHERE GeomWithin(geom, GeomFromWKT('" + bbox + "'))");
    results = queryRunner.run();
    final long withinCount = results.count();
    LOGGER.warn("Got " + withinCount + " for GeomWithin test");

    Assert.assertTrue("Within and Contains counts should be equal", containsCount == withinCount);

    // Test the output writer
    final SqlResultsWriter sqlResultsWriter = new SqlResultsWriter(results, dataStore);

    sqlResultsWriter.writeResults("sqltest");

    queryRunner.removeAllStores();

    // Test other spatial UDFs
    final String line1 = "LINESTRING(0 0, 10 10)";
    final String line2 = "LINESTRING(0 10, 10 0)";
    queryRunner.setSql(
        "SELECT GeomIntersects(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");
    Row result = queryRunner.run().head();

    final boolean intersect = result.getBoolean(0);
    LOGGER.warn("GeomIntersects returned " + intersect);

    Assert.assertTrue("Lines should intersect", intersect);

    queryRunner.setSql(
        "SELECT GeomDisjoint(GeomFromWKT('" + line1 + "'), GeomFromWKT('" + line2 + "'))");
    result = queryRunner.run().head();

    final boolean disjoint = result.getBoolean(0);
    LOGGER.warn("GeomDisjoint returned " + disjoint);

    Assert.assertFalse("Lines should not be disjoint", disjoint);

  } catch (final Exception e) {
    e.printStackTrace();
    TestUtils.deleteAll(dataStore);
    Assert.fail(
        "Error occurred while testing a bounding box query of spatial index: '"
            + e.getLocalizedMessage()
            + "'");
  }

  // Clean up
  TestUtils.deleteAll(dataStore);
}
 
Example 17
Source File: RP_DBSCAN.java    From RP-DBSCAN with Apache License 2.0
/**
 * Phase I: pre-processing for RP-DBSCAN.
 * Phase I-1 (Pseudo Random Partitioning) and Phase I-2 (Cell_Dictionary_Building & Broadcasting)
 */
public void phaseI()
{
	/**
	 * Phase I-1. Pseudo Random Partitioning
	 */
	
	//Read input data set from HDFS
	JavaRDD<String> lines = sc.textFile(Conf.inputPath, Conf.numOfPartitions);
	JavaPairRDD<List<Integer>, ApproximatedCell> dataMap = null;
	
	//Data partitioning
	if(Conf.boost)
	{
		dataMap = lines.mapToPair(new Methods.PointToCell(Conf.dim, Conf.epsilon))
		.combineByKey(new Methods.CreateLocalApproximatedPoint(Conf.dim, Conf.epsilon, Conf.rho), new Methods.LocalApproximation(Conf.dim, Conf.epsilon, Conf.rho), new Methods.GlobalApproximation(Conf.dim))
		.mapToPair(new Methods.PseudoRandomPartition2(Conf.metaBlockWindow)).persist(StorageLevel.MEMORY_AND_DISK_SER());
	}else
		dataMap = lines.mapToPair(new Methods.PointToCell(Conf.dim, Conf.epsilon)).groupByKey().mapToPair(new Methods.PseudoRandomPartition(Conf.dim, Conf.epsilon, Conf.rho, Conf.metaBlockWindow, Conf.pairOutputPath)).persist(StorageLevel.MEMORY_AND_DISK_SER());

	numOfCells = dataMap.count();

	/**
	 * Phase I-2. Cell_Dictionary_Building & Broadcasting
	 */
	//Dictionary Defragmentation
	JavaPairRDD<List<Integer>, Long> ptsCountforEachMetaBlock = dataMap.mapToPair(new Methods.MetaBlockMergeWithApproximation()).reduceByKey(new Methods.AggregateCount());
	List<Tuple2<List<Integer>, Long>> numOfPtsInCell = ptsCountforEachMetaBlock.collect();
	//System.out.println("# of Blocks for virtually combining : " + numOfPtsInCell.size());
			
	HashMap<List<Integer>,List<Integer>> partitionIndex = new HashMap<List<Integer>,List<Integer>>();
	Tuple2<Long, List<Partition>> metaInfoForVirtualCombining = Methods.scalablePartition(numOfPtsInCell, Conf.dim, Conf.numOflvhCellsInMetaPartition/Conf.dim, partitionIndex);
	numOfSubCells = metaInfoForVirtualCombining._1;
	List<Partition> wholePartitions = metaInfoForVirtualCombining._2;
	numOfSubDictionaries = wholePartitions.size();	
			
	//Build Two-Level Cell Dictionary composed of multiple sub-dictionaries
	JavaPairRDD<Integer, Iterable<ApproximatedCell>> evenlySplitPartitions = dataMap.flatMapToPair(new Methods.AssignApproximatedPointToPartition(partitionIndex)).groupByKey(wholePartitions.size());
	JavaPairRDD<Null, Null> metaDataSet = evenlySplitPartitions.mapToPair(new Methods.MetaGenerationWithApproximation(Conf.dim, Conf.epsilon, Conf.rho, Conf.minPts, conf, wholePartitions));
	metaDataSet.collect();
	
	//Re-partition the pseudo-random partitions across the workers by a randomly assigned integer value to reduce memory usage.
	dataset = dataMap.mapToPair(new Methods.Repartition(Conf.numOfPartitions)).repartition(Conf.numOfPartitions).persist(StorageLevel.MEMORY_AND_DISK_SER());

	//Broadcast the two-level cell dictionary to every worker.
	try {
		metaPaths = FileIO.broadCastData(sc, conf, Conf.metaFoler);
	} catch (IOException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
}
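
In this phase count() does double duty: it materializes the freshly persisted dataMap partitioning and, at the same time, records the number of non-empty cells produced by the pseudo-random partitioning in numOfCells.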
 
Example 18
Source File: SparkColumnCardinality.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String tableName = optionsHelper.getOptionValue(OPTION_TABLE_NAME);
    String output = optionsHelper.getOptionValue(OPTION_OUTPUT);
    int columnCnt = Integer.valueOf(optionsHelper.getOptionValue(OPTION_COLUMN_COUNT));

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Calculate table:" + tableName);
    // set spark.sql.catalogImplementation=hive; if it is not set, SparkSession cannot read Hive metadata and throws org.apache.spark.sql.AnalysisException
    conf.set("spark.sql.catalogImplementation", "hive");
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(output));
        // the table will be loaded by Spark SQL, so isSequenceFile is set to false
        final JavaRDD<String[]> recordRDD = SparkUtil.hiveRecordInputRDD(false, sc, null, tableName);
        JavaPairRDD<Integer, Long> resultRdd = recordRDD.mapPartitionsToPair(new BuildHllCounter())
                .reduceByKey((x, y) -> {
                    x.merge(y);
                    return x;
                })
                .mapToPair(record -> {
                    return new Tuple2<>(record._1, record._2.getCountEstimate());
                })
                .sortByKey(true, 1)
                .cache();

        if (resultRdd.count() == 0) {
            ArrayList<Tuple2<Integer, Long>> list = new ArrayList<>();
            for (int i = 0; i < columnCnt; ++i) {
                list.add(new Tuple2<>(i, 0L));
            }
            JavaPairRDD<Integer, Long> nullRdd = sc.parallelizePairs(list).repartition(1);
            nullRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        } else {
            resultRdd.saveAsNewAPIHadoopFile(output, IntWritable.class, LongWritable.class, TextOutputFormat.class);
        }
    }
}
 
Example 19
Source File: ModelReplicaSplit.java    From OpenDL with Apache License 2.0
/**
 * Split the input samples into one partition per ModelReplica.
 *
 * @param input the input samples
 * @param nrModelReplica the number of model replicas to split across
 * @param config the training configuration, which supplies the storage level for persisting the split data
 * @return the split samples, keyed by model replica index
 */
public JavaPairRDD<Integer, List<T>> split(JavaRDD<T> input, int nrModelReplica, SGDTrainConfig config) {
    JavaPairRDD<Integer, List<T>> output = input.map(new SplitModelReplica(nrModelReplica)).groupByKey().persist(config.getMrDataStorage());
    output.count();
    return output;
}