Java Code Examples for org.apache.spark.api.java.JavaPairRDD#foreach()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#foreach(). You can go to the original project or source file by following the links above each example.
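Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share (the class name ForeachSketch and the sample data are illustrative, not taken from any project below): build a JavaPairRDD, then pass a function to foreach(). foreach() is an action that runs on the executors, so with a non-local master any println output lands in the executor logs rather than on the driver console.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class ForeachSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("ForeachSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // build a small pair RDD from in-memory tuples
        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(
                Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("a", 3)));

        // apply a function to every (key, value) tuple
        pairs.foreach(tuple -> System.out.println(tuple._1 + " -> " + tuple._2));

        sc.close();
    }
}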
Example 1
Source File: Accumulator.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // initialize the JavaSparkContext from the SparkSession
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // set the log level
    javaSparkContext.setLogLevel("ERROR");
    // create the source RDD
    JavaRDD<String> rdd = javaSparkContext.parallelize(Arrays.asList(JavaBean.origin_id, JavaBean.asset_name)).cache();

    AttackAccumulator attackAccumulator = new AttackAccumulator();
    // register the custom accumulator
    javaSparkContext.sc().register(attackAccumulator, "attack_count");
    // generate a random number and append it to each value
    JavaPairRDD<String, String> javaPairRDD = rdd.mapToPair((PairFunction<String, String, String>) s -> {
        Integer random = new Random().nextInt(10);
        return new Tuple2<>(s, s + ":" + random);
    });

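    // add each value to the accumulator on the executors; the merged result is read back on the driver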
    javaPairRDD.foreach((VoidFunction<Tuple2<String, String>>) tuple2 -> {
        attackAccumulator.add(tuple2._2);
    });
    System.out.println(attackAccumulator.value());
}
 
Example 2
Source File: InputFormatTest.java    From HadoopCV with Apache License 2.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf().setAppName("VideoInput").setMaster("local[2]");
	JavaSparkContext sc = new JavaSparkContext(conf);
	
	Configuration hc = new org.apache.hadoop.conf.Configuration();
	JavaPairRDD<Text, HBMat> video = sc.newAPIHadoopFile("data/bike.avi", VideoInputFormat.class, Text.class, HBMat.class,hc);
	
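	// dump the decoded matrix of every video frame to stdout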
	video.foreach(new VoidFunction<Tuple2<Text,HBMat>>() {	
		@Override
		public void call(Tuple2<Text, HBMat> tuple) throws Exception {
			HBMat image = (HBMat)tuple._2;
			System.out.print(image.getBmat().dump());
		}
	});
	
	System.out.print(video.count());
}
 
Example 3
Source File: Cartesian.java    From SparkDemo with MIT License
private static void cartesian(JavaSparkContext sc) {
    List<String> names = Arrays.asList("张三", "李四", "王五");
    List<Integer> scores = Arrays.asList(60, 70, 80);

    JavaRDD<String> namesRDD = sc.parallelize(names);
    JavaRDD<Integer> scoreRDD = sc.parallelize(scores);

    /**
     * Cartesian product: pair every element of namesRDD with every element of scoreRDD.
     */
    JavaPairRDD<String, Integer> cartesianRDD = namesRDD.cartesian(scoreRDD);
    
    cartesianRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        public void call(Tuple2<String, Integer> t) throws Exception {
            System.out.println(t._1 + "\t" + t._2());
        }
    });
}
 
Example 4
Source File: ReadLocalMmtfHadoopFile.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) {  
	
	if (args.length != 1) {
		System.err.println("Usage: " + ReadLocalMmtfHadoopFile.class.getSimpleName() + " <inputFilePath>");
		System.exit(1);
	}
    
    // instantiate Spark. Each Spark application needs these two lines of code.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ReadLocalMmtfHadoopFile.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
	 
    // read a local MMTF file
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(args[0], sc);
    
    System.out.println("# structures: " + pdb.count());
    
    // take a 1% random sample of the structures and print structural details
    pdb = pdb.sample(false, 0.01);
    pdb.foreach(t -> TraverseStructureHierarchy.printStructureData(t._2));
    
    // close Spark
    sc.close();
}
 
Example 5
Source File: ReadLocalMmtf.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) {  
	
	if (args.length != 1) {
		System.err.println("Usage: " + ReadLocalMmtf.class.getSimpleName() + " <inputFilePath>");
		System.exit(1);
	}
    
    // instantiate Spark. Each Spark application needs these two lines of code.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ReadLocalMmtf.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
	 
    // read a local MMTF file
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readMmtfFiles(args[0], sc);
    
    // print structural details
    pdb.foreach(t -> TraverseStructureHierarchy.printStructureData(t._2));
    System.out.println("# structures: " + pdb.count());
    
    // close Spark
    sc.close();
}
 
Example 6
Source File: Interleave.java    From ViraPipe with MIT License
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

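    // for each pair of splits, read both FASTQ fragments and write them interleaved under splitDir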
    zips.foreach( splits ->  {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
    });
  }
 
Example 7
Source File: DecompressInterleave.java    From ViraPipe with MIT License
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    String[] ns = fst.getPath().getName().split("\\.");
    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach( splits ->  {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir, path.getParent().getName()+"_"+splits._1.getStart()+".fq");
    });
  }
 
Example 8
Source File: Decompress.java    From ViraPipe with MIT License
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach( splits ->  {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
    });
  }
 
Example 9
Source File: InterleaveMulti.java    From ViraPipe with MIT License
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach( splits ->  {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
    });
  }
 
Example 10
Source File: BroadCastParam.java    From sparkResearch with Apache License 2.0
/**
 * Broadcast variable test
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // initialize the JavaSparkContext from the SparkSession
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // build the list that will be broadcast
    // (as noted earlier, broadcast variables are read-only)
    final List<String> broadcastList = Arrays.asList("190099HJLL","98392QUEYY","561788LLKK");
    // create the broadcast variable and distribute it to the executors
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // define the data
    JavaPairRDD<String,String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String,String> resultPairRDD = pairRDD.filter((Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
 
Example 11
Source File: TraverseStructureHierarchy.java    From mmtf-spark with Apache License 2.0
public static void main(String args[]) {

		// instantiate Spark. Each Spark application needs these two lines of code.
		SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ReadMmtfReduced.class.getSimpleName());
		JavaSparkContext sc = new JavaSparkContext(conf);

		//	    List<String> pdbIds = Arrays.asList("5UTV"); // multiple models
		//	    List<String> pdbIds = Arrays.asList("1BZ1"); // multiple protein chains
		//      List<String> pdbIds = Arrays.asList("1STP"); // single protein chain
		List<String> pdbIds = Arrays.asList("1HV4"); // structure with 2 bioassemblies
		//	    List<String> pdbIds = Arrays.asList("2NBK"); // single protein chain
		JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache(); 

		pdb.foreach(t -> TraverseStructureHierarchy.printAll(t._2));      
	}
 
Example 12
Source File: MapToCathDomains.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {

	    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(MapToCathDomains.class.getSimpleName());
	    JavaSparkContext sc = new JavaSparkContext(conf);

//	    List<String> pdbIds = Arrays.asList("1HV4");
	    List<String> pdbIds = Arrays.asList("1STQ");
	    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);
//	    String baseUrl = "ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/daily-release/newest/cath-b-newest-all.gz";
	   
	    pdb = pdb.flatMapToPair(new StructureToCathDomains(StructureToCathDomains.CATH_B_NEWEST_ALL));
	    
	    pdb.foreach(t -> TraverseStructureHierarchy.printAll(t._2));
	   
	    System.out.println("# cathDomains in " + pdbIds.get(0) + ": " + pdb.count());
	    
	    sc.close();
	}
 
Example 13
Source File: WordCountJava.java    From BigDataArchitect with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

        SparkConf conf = new SparkConf();
        conf.setAppName("java-wordcount");
        conf.setMaster("local");

        JavaSparkContext jsc = new JavaSparkContext(conf);

        JavaRDD<String> fileRDD = jsc.textFile("bigdata-spark/data/testdata.txt");

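        // split each line into individual words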
        JavaRDD<String> words = fileRDD.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        });

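        // map each word to a (word, 1) pair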
        JavaPairRDD<String, Integer> pairWord = words.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

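        // sum the counts for each word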
        JavaPairRDD<String, Integer> res = pairWord.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer oldV, Integer v) throws Exception {
                return oldV + v;
            }
        });

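        // print each (word, count) pair; with a non-local master this output would appear in the executor logs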
        res.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            public void call(Tuple2<String, Integer> value) throws Exception {
                System.out.println(value._1+"\t"+value._2);
            }
        });

//
//        RandomAccessFile rfile = new RandomAccessFile("ooxx","rw");
//
////        rfile.seek(222);
//        FileChannel channel = rfile.getChannel();
//        //  linux  fd   write(fd)  read(fd)
//
//
//        ByteBuffer b1 = ByteBuffer.allocate(1024);
//        ByteBuffer b2 = ByteBuffer.allocateDirect(1024);
//        MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_WRITE, 80, 120);
//


    }
 
Example 14
Source File: ReduceByKey.java    From SparkDemo with MIT License
/**
 * @category Count the occurrences of each word in a text file
 * @param sc
 */
private static void reduceByKey(JavaSparkContext sc) {
	JavaRDD<String> lines = sc.textFile(Constant.LOCAL_FILE_PREX +"README.md");

	/**
	 * Split each line on ' ' and flatten the results into a single RDD of words.
	 */
	JavaRDD<String> wordsRDD = lines.flatMap(new FlatMapFunction<String, String>() {

		private static final long serialVersionUID = 1L;

		public Iterator<String> call(String line) throws Exception {
			List<String> words = Arrays.asList(line.split(" "));
			return words.iterator();
		}
	});

	/**
	 * Convert each word into a (word, 1) tuple.
	 */
	JavaPairRDD<String, Integer> wordsCount = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {

		private static final long serialVersionUID = 1L;

		public Tuple2<String, Integer> call(String word) throws Exception {
			return new Tuple2<String, Integer>(word, 1);
		}
	});

	/**
	 * Reduce by the tuple key (the word), summing the counts for each word.
	 */
	JavaPairRDD<String, Integer> resultRDD = wordsCount.reduceByKey(new Function2<Integer, Integer, Integer>() {

		private static final long serialVersionUID = 1L;

		public Integer call(Integer v1, Integer v2) throws Exception {
			return v1 + v2;
		}
	});

	resultRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {

		private static final long serialVersionUID = 1L;

		public void call(Tuple2<String, Integer> t) throws Exception {
			System.out.println(t._1 + "\t" + t._2());
		}
	});

	sc.close();
}
 
Example 15
Source File: Basic.java    From learning-spark-with-java with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Pairs-Basic")
      .master("local[4]")
      .getOrCreate();

  JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

  List<Tuple2<String, Integer>> pairs =
      Arrays.asList(
          new Tuple2<>("1",9), new Tuple2<>("1",2), new Tuple2<>("1",1),
          new Tuple2<>("2",3), new Tuple2<>("2",4), new Tuple2<>("3",1),
          new Tuple2<>("3",5), new Tuple2<>("6",2), new Tuple2<>("6",1),
          new Tuple2<>("6",4), new Tuple2<>("8",1));

  // a randomly partitioned pair RDD
  JavaPairRDD<String, Integer> pairsRDD = sc.parallelizePairs(pairs, 4);

  System.out.println("*** the original pairs");
  pairsRDD.foreach(i -> System.out.println(i));

  //
  // Pairs can be collected as a Map, but this only works well if the
  // keys are unique. Here they aren't, so an arbitrary value is chosen for each key:
  //
  Map<String, Integer> pairsAsMap = pairsRDD.collectAsMap();
  System.out.println("*** the pretty useless map");
  System.out.println(pairsAsMap);

  // let's say we just want the pair with minimum value for each key
  // we can use one of the handy methods in PairRDDFunctions. To reduce we need
  // only supply a single function to combine all the values for each key -- the result
  // has to have the same type as the values
  JavaPairRDD<String, Integer> reducedRDD = pairsRDD.reduceByKey(Math::min);

  System.out.println("*** the reduced pairs");
  reducedRDD.foreach(i -> System.out.println(i));

  // the reduced pairs have unique keys so collecting to a map works a lot better
  Map<String, Integer> reducedAsMap = reducedRDD.collectAsMap();
  System.out.println("*** the reduced pairs as a map");
  System.out.println(reducedAsMap);

  // folding is a little more general: we get to specify the identity value:
  // say 0 for adding and 1 for multiplying
  JavaPairRDD<String, Integer> foldedRDD =
      pairsRDD.foldByKey(1, (x, y) -> x * y);

  System.out.println("*** the folded pairs");
  foldedRDD.foreach(i -> System.out.println(i));

  // Combining is more general: you can produce values of a different type, which is very powerful.
  // You need to provide three functions: the first converts an individual value to the new type, the second
  // incorporates an additional value into the result, and the third combines intermediate results, which is
  // used by execution to avoid excessive communication between partitions. The first function is applied once
  // per partition and the second is used for each additional value in the partition.
  // Below is a pretty classical example of its use: compute a per-key average by first computing the sum and count
  // for each key and then dividing.
  JavaPairRDD<String, Tuple2<Integer, Integer>> combinedRDD =
      pairsRDD.combineByKey(
          value -> new Tuple2<>(value, 1),
          (sumAndCount, value) -> new Tuple2<>(sumAndCount._1() + value, sumAndCount._2() + 1),
          (sumAndCount1, sumAndCount2) ->
              new Tuple2<>(sumAndCount1._1() + sumAndCount2._1(), sumAndCount1._2() + sumAndCount2._2())
      );

  JavaPairRDD<String, Double> averageRDD =
      combinedRDD.mapValues(sumAndCount -> (double) sumAndCount._1() / sumAndCount._2());

  System.out.println("*** the average pairs");
  averageRDD.foreach(i -> System.out.println(i));

  // The dividing could be done just by calling map, but in Java this requires a lot of conversion between the
  // two kinds of RDD and ends up *VERY* cumbersome.
  JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> tupleCombinedRDD =
      JavaRDD.fromRDD(combinedRDD.rdd(), combinedRDD.classTag());
  JavaRDD<Tuple2<String, Double>> tupleDividedRDD = tupleCombinedRDD.map(keyAndsumAndCount ->
      new Tuple2<>(keyAndsumAndCount._1(), (double) keyAndsumAndCount._2()._1() / keyAndsumAndCount._2()._2()));
  JavaPairRDD<String, Double> averageRDDtheHardWay = JavaPairRDD.fromJavaRDD(tupleDividedRDD);

  // remember these won't necessarily come out in the same order, so they may not obviously be
  // the same as above
  System.out.println("*** the average pairs the hard way");
  averageRDDtheHardWay.foreach(i -> System.out.println(i));

  spark.stop();
}
 
Example 16
Source File: TestHoodieIndex.java    From hudi with Apache License 2.0
@ParameterizedTest
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
public void testSimpleTagLocationAndUpdate(IndexType indexType) throws Exception {
  setUp(indexType);
  String newCommitTime = "001";
  int totalRecords = 10 + random.nextInt(20);
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
  JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);

  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable hoodieTable = HoodieTable.create(metaClient, config, jsc.hadoopConfiguration());

  // Test tagLocation without any entries in index
  JavaRDD<HoodieRecord> javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
  assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);

  // Insert totalRecords records
  writeClient.startCommitWithTime(newCommitTime);
  JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
  assertNoWriteErrors(writeStatues.collect());

  // Now tagLocation for these records, index should not tag them since it was a failed
  // commit
  javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
  assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
  // Now commit this & update location of records inserted and validate no errors
  writeClient.commit(newCommitTime, writeStatues);
  // Now tagLocation for these records, index should tag them correctly
  metaClient = HoodieTableMetaClient.reload(metaClient);
  hoodieTable = HoodieTable.create(metaClient, config, jsc.hadoopConfiguration());
  javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
  Map<String, String> recordKeyToPartitionPathMap = new HashMap<>();
  List<HoodieRecord> hoodieRecords = writeRecords.collect();
  hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath()));

  assertEquals(totalRecords, javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size());
  assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count());
  assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null
      && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
  javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch"));

  JavaRDD<HoodieKey> hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey());
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = index.fetchRecordLocation(hoodieKeyJavaRDD, jsc, hoodieTable);
  List<HoodieKey> hoodieKeys = hoodieKeyJavaRDD.collect();
  assertEquals(totalRecords, recordLocations.collect().size());
  assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count());
  recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey"));
  recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch"));
}
 
Example 17
Source File: TestHoodieIndex.java    From hudi with Apache License 2.0
@ParameterizedTest
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
public void testTagLocationAndDuplicateUpdate(IndexType indexType) throws Exception {
  setUp(indexType);
  String newCommitTime = "001";
  int totalRecords = 10 + random.nextInt(20);
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
  JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);

  HoodieTable hoodieTable = HoodieTable.create(metaClient, config, jsc.hadoopConfiguration());

  writeClient.startCommitWithTime(newCommitTime);
  JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
  JavaRDD<HoodieRecord> javaRDD1 = index.tagLocation(writeRecords, jsc, hoodieTable);

  // Duplicate upsert and ensure correctness is maintained
  // We are trying to approximately imitate the case when the RDD is recomputed. For RDD creating, driver code is not
  // recomputed. This includes the state transitions. We need to delete the inflight instance so that subsequent
  // upsert will not run into conflicts.
  metaClient.getFs().delete(new Path(metaClient.getMetaPath(), "001.inflight"));

  writeClient.upsert(writeRecords, newCommitTime);
  assertNoWriteErrors(writeStatues.collect());

  // Now commit this & update location of records inserted and validate no errors
  writeClient.commit(newCommitTime, writeStatues);
  // Now tagLocation for these records, hbaseIndex should tag them correctly
  metaClient = HoodieTableMetaClient.reload(metaClient);
  hoodieTable = HoodieTable.create(metaClient, config, jsc.hadoopConfiguration());
  JavaRDD<HoodieRecord> javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);

  Map<String, String> recordKeyToPartitionPathMap = new HashMap<>();
  List<HoodieRecord> hoodieRecords = writeRecords.collect();
  hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath()));

  assertEquals(totalRecords, javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size());
  assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count());
  assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null
      && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
  javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch"));

  JavaRDD<HoodieKey> hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey());
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = index.fetchRecordLocation(hoodieKeyJavaRDD, jsc, hoodieTable);
  List<HoodieKey> hoodieKeys = hoodieKeyJavaRDD.collect();
  assertEquals(totalRecords, recordLocations.collect().size());
  assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count());
  recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey"));
  recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch"));
}
 
Example 18
Source File: TestHoodieIndex.java    From hudi with Apache License 2.0
@ParameterizedTest
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType) throws Exception {
  setUp(indexType);
  String newCommitTime = writeClient.startCommit();
  int totalRecords = 20 + random.nextInt(20);
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
  JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
  metaClient = HoodieTableMetaClient.reload(metaClient);

  // Insert totalRecords records
  JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
  assertNoWriteErrors(writeStatues.collect());

  // commit this upsert
  writeClient.commit(newCommitTime, writeStatues);
  HoodieTable hoodieTable = HoodieTable.create(metaClient, config, jsc.hadoopConfiguration());

  // Now tagLocation for these records, hbaseIndex should tag them
  JavaRDD<HoodieRecord> javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
  assert (javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == totalRecords);

  // check tagged records are tagged with correct fileIds
  List<String> fileIds = writeStatues.map(WriteStatus::getFileId).collect();
  assert (javaRDD.filter(record -> record.getCurrentLocation().getFileId() == null).collect().size() == 0);
  List<String> taggedFileIds = javaRDD.map(record -> record.getCurrentLocation().getFileId()).distinct().collect();

  Map<String, String> recordKeyToPartitionPathMap = new HashMap<>();
  List<HoodieRecord> hoodieRecords = writeRecords.collect();
  hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath()));

  JavaRDD<HoodieKey> hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey());
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = index.fetchRecordLocation(hoodieKeyJavaRDD, jsc, hoodieTable);
  List<HoodieKey> hoodieKeys = hoodieKeyJavaRDD.collect();
  assertEquals(totalRecords, recordLocations.collect().size());
  assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count());
  recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey"));
  recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch"));

  // both lists should match
  assertTrue(taggedFileIds.containsAll(fileIds) && fileIds.containsAll(taggedFileIds));
  // Rollback the last commit
  writeClient.rollback(newCommitTime);

  hoodieTable = HoodieTable.create(metaClient, config, jsc.hadoopConfiguration());
  // Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled
  // back commit
  javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
  assert (javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 0);
  assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0);
}