Java Code Examples for org.apache.spark.api.java.JavaRDD#foreach()

The following examples show how to use org.apache.spark.api.java.JavaRDD#foreach(). They are drawn from open-source projects; each example notes its source file, originating project, and license.
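Before the project examples, here is a minimal, self-contained sketch of the call pattern they all share. It is illustrative only: the local master, app name, and sample data are assumptions, not code from any of the projects below. foreach() is an action; Spark serializes the given function and runs it once per element on the executors, typically for side effects such as printing, logging, or writing to an external system.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ForeachSketch {
    public static void main(String[] args) {
        // Illustrative local configuration; in a real job the master is usually set by spark-submit.
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("ForeachSketch");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> words = sc.parallelize(Arrays.asList("a", "b", "c"));
            // The lambda is serialized and executed on the executors, not the driver,
            // so in cluster mode the output appears in the executor logs.
            words.foreach(word -> System.out.println(word));
        }
    }
}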
Example 1
Source File: TestSparkContextProvider.java    From rdf2x with Apache License 2.0
public <T> void assertRDDEquals(String message, JavaRDD<T> expected, JavaRDD<T> result) {
    Option<Tuple3<T, Integer, Integer>> diff = JavaRDDComparisons.compareRDD(expected, result);
    if (diff.isDefined()) {
        log.error("EXPECTED");
        expected.foreach(row -> log.error(row.toString()));
        log.error("RESULT");
        result.foreach(row -> log.error(row.toString()));
        log.error("FIRST DIFF");
        Tuple3<T, Integer, Integer> diffTriple = diff.get();
        log.error(diffTriple.toString());
        if (diffTriple._2() == 0) {
            log.error("(row not expected but present in result {} times)", diffTriple._3());
        }
        if (diffTriple._3() == 0) {
            log.error("(row expected {} times but not present)", diffTriple._2());
        }
        throw new AssertionError(message);
    }
}
 
Example 2
Source File: InterleaveMulti.java    From ViraPipe with MIT License
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  //TODO: Handle also compressed files
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach(split -> {
    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
  });
}
 
Example 3
Source File: SparkOperatorProfiler.java    From rheem with Apache License 2.0
/**
 * Helper method to generate data quanta and provide them as a cached {@link JavaRDD}.
 */
protected <T> JavaRDD<T> prepareInputRddInDriver(long cardinality, int inputIndex) {
    @SuppressWarnings("unchecked")
    final Supplier<T> supplier = (Supplier<T>) this.dataQuantumGenerators.get(inputIndex);
    JavaRDD<T> finalInputRdd = null;

    // Create batches, parallelize them, and union them.
    long remainder = cardinality;
    do {
        int batchSize = (int) Math.min(remainder, this.dataQuantumGeneratorBatchSize);
        List<T> batch = new ArrayList<>(batchSize);
        while (batch.size() < batchSize) {
            batch.add(supplier.get());
        }
        final JavaRDD<T> batchRdd = this.sparkExecutor.sc.parallelize(batch);
        finalInputRdd = finalInputRdd == null ? batchRdd : finalInputRdd.union(batchRdd);
        remainder -= batchSize;
    } while (remainder > 0);

    // Shuffle and cache the RDD.
    final JavaRDD<T> cachedInputRdd = this.partition(finalInputRdd).cache();
    // No-op action that forces evaluation so the RDD is actually materialized in the cache.
    cachedInputRdd.foreach(dataQuantum -> {
    });

    return cachedInputRdd;
}
 
Example 4
Source File: Intersection.java    From SparkDemo with MIT License
static void intersection(JavaSparkContext sc) {
	List<String> datas1 = Arrays.asList("张三", "李四", "tom");
	List<String> datas2 = Arrays.asList("tom", "gim");

	/**
	 * Returns the intersection of the two RDDs.
	 */
	JavaRDD<String> intersectionRDD = sc.parallelize(datas1).intersection(sc.parallelize(datas2));

	intersectionRDD.foreach(new VoidFunction<String>() {

		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});

}
 
Example 5
Source File: SampleAndTake.java    From SparkDemo with MIT License
static void sample(JavaSparkContext sc) {
	List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

	JavaRDD<Integer> dataRDD = sc.parallelize(datas);
	
	/**
	 * Random sampling: when withReplacement is true, sampled elements are put back and may be
	 * drawn more than once; false means sampling without replacement. fraction is the expected
	 * sampling proportion, and seed is the random number seed.
	 */
	JavaRDD<Integer> sampleRDD = dataRDD.sample(false, 0.5, System.currentTimeMillis());
	
	// TODO dataRDD.takeSample(false, 3);
	// TODO dataRDD.take(3)

	sampleRDD.foreach(new VoidFunction<Integer>() {
		@Override
		public void call(Integer t) throws Exception {
			System.out.println(t);
		}
	});

	sc.close();
}
 
Example 6
Source File: Distinct.java    From SparkDemo with MIT License
private static void distinct(JavaSparkContext sc) {
	List<String> datas = Arrays.asList("张三", "李四", "tom", "张三");

	/**
	 * Removes duplicate elements; involves a shuffle operation.
	 */
	JavaRDD<String> distinctRDD = sc.parallelize(datas).distinct();
	
	distinctRDD.foreach(new VoidFunction<String>() {
		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});
}
 
Example 7
Source File: Filter.java    From SparkDemo with MIT License
private static void filter(JavaSparkContext sc) {
	List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

	JavaRDD<Integer> rddData = sc.parallelize(datas);
	JavaRDD<Integer> filterRDD = rddData.filter(
			// jdk1.8 lambda equivalent: v -> v >= 4
			new Function<Integer, Boolean>() {
				public Boolean call(Integer v) throws Exception {
					// keep values greater than or equal to 4 (filter out numbers smaller than 4)
					return v >= 4;
				}
			});

	filterRDD.foreach(
			// jdk1.8
			// v -> System.out.println(v)
			new VoidFunction<Integer>() {
				@Override
				public void call(Integer integer) throws Exception {
					System.out.println(integer);
				}
			});
	sc.close();
}
 
Example 8
Source File: FlatMap.java    From SparkDemo with MIT License
private static void flatMap(JavaSparkContext sc) {
	List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
	JavaRDD<String> rddData = sc.parallelize(data);

	FlatMapFunction<String, String> flatMapFunction=new FlatMapFunction<String, String>() {
		@Override
		public Iterator<String> call(String s) throws Exception {
			List<String> list = Arrays.asList(s.split(","));
			return list.iterator();
		}
	};
	JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);


	flatMapData.foreach(new VoidFunction<String>() {
		@Override
		public void call(String v) throws Exception {
			System.out.println(v);
		}
	});

	sc.close();
}
 
Example 9
Source File: Union.java    From SparkDemo with MIT License
static void union(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    /**
     * Merges two RDDs without removing duplicates; the element types of the two RDDs must match.
     */
    JavaRDD<String> unionRDD = data1RDD
            .union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});

    sc.close();
}
 
Example 10
Source File: UtilHelpers.java    From hudi with Apache License 2.0
public static int handleErrors(JavaSparkContext jsc, String instantTime, JavaRDD<WriteStatus> writeResponse) {
  Accumulator<Integer> errors = jsc.accumulator(0);
  writeResponse.foreach(writeStatus -> {
    if (writeStatus.hasErrors()) {
      errors.add(1);
      LOG.error(String.format("Error processing records :writeStatus:%s", writeStatus.getStat().toString()));
    }
  });
  if (errors.value() == 0) {
    LOG.info(String.format("Table imported into hoodie with %s instant time.", instantTime));
    return 0;
  }
  LOG.error(String.format("Import failed with %d errors.", errors.value()));
  return -1;
}
 
Example 11
Source File: SparkOperatorProfiler.java    From rheem with Apache License 2.0
/**
 * Helper method to generate data quanta and provide them as a cached {@link JavaRDD}.
 */
protected <T> JavaRDD<T> prepareInputRddInWorker(long cardinality, int inputIndex) {

    // Create batches, parallelize them, and union them.
    final List<Integer> batchSizes = new LinkedList<>();
    int numFullBatches = (int) (cardinality / this.dataQuantumGeneratorBatchSize);
    for (int i = 0; i < numFullBatches; i++) {
        batchSizes.add(this.dataQuantumGeneratorBatchSize);
    }
    batchSizes.add((int) (cardinality % this.dataQuantumGeneratorBatchSize));

    @SuppressWarnings("unchecked")
    final Supplier<T> supplier = (Supplier<T>) this.dataQuantumGenerators.get(inputIndex);
    JavaRDD<T> finalInputRdd = this.sparkExecutor.sc
            .parallelize(batchSizes, 1) // Single partition to ensure the same data generator.
            .flatMap(batchSize -> {
                List<T> list = new ArrayList<>(batchSize);
                for (int i = 0; i < batchSize; i++) {
                    list.add(supplier.get());
                }
                return list.iterator();
            });
    // Shuffle and cache the RDD.
    final JavaRDD<T> cachedInputRdd = this.partition(finalInputRdd).cache();
    // No-op action that forces evaluation so the RDD is actually materialized in the cache.
    cachedInputRdd.foreach(dataQuantum -> {
    });

    return cachedInputRdd;
}
 
Example 12
Source File: SparkSegmentTarPushJob.java    From incubator-pinot with Apache License 2.0
@Override
public void run()
    throws Exception {
  if (!_enableParallelPush) {
    super.run();
  } else {
    List<Path> segmentPathsToPush = getDataFilePaths(_segmentPattern);
    retainRecentFiles(segmentPathsToPush, _lookBackPeriod);
    List<String> segmentsToPush = new ArrayList<>();
    segmentPathsToPush.forEach(path -> {
      segmentsToPush.add(path.toString());
    });
    JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate());
    if (_pushJobParallelism == -1) {
      _pushJobParallelism = segmentsToPush.size();
    }
    JavaRDD<String> pathRDD = sparkContext.parallelize(segmentsToPush, _pushJobParallelism);
    pathRDD.foreach(segmentTarPath -> {
      try (ControllerRestApi controllerRestApi = getControllerRestApi()) {
        FileSystem fileSystem = FileSystem.get(new Path(segmentTarPath).toUri(), new Configuration());
        // TODO: Deal with invalid prefixes in the future
        List<String> currentSegments = controllerRestApi.getAllSegments("OFFLINE");
        controllerRestApi.pushSegments(fileSystem, Arrays.asList(new Path(segmentTarPath)));
        if (_deleteExtraSegments) {
          controllerRestApi
              .deleteSegmentUris(getSegmentsToDelete(currentSegments, Arrays.asList(new Path(segmentTarPath))));
        }
      }
    });
  }
}
 
Example 13
Source File: Interleave.java    From ViraPipe with MIT License
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach(split -> {
    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
  });
}
 
Example 14
Source File: Decompress.java    From ViraPipe with MIT License
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach(split -> {
    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
  });
}
 
Example 15
Source File: ActionRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Iterates over each element of the RDD.
 *
 * @since hui_project 1.0.0
 */
public void testForEach() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    stringJavaRDD.foreach(x->{
        System.out.println(x);
    });
}
 
Example 16
Source File: PersistExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License
	/**
	 * @param args
	 */
	public static void main(String[] args) {
		//C:\Users\sumit.kumar\Downloads\bin\warehouse
		//System.setProperty("hadoop.home.dir", "C:\\Users\\sumit.kumar\\Downloads");
		String logFile = "src/main/resources/Apology_by_Plato.txt"; // Should be some file on your system
		Logger rootLogger = LogManager.getRootLogger();
		rootLogger.setLevel(Level.WARN);
		SparkConf conf = new SparkConf().setMaster("local").setAppName("ActionExamples").set("spark.hadoop.validateOutputSpecs", "false");
		JavaSparkContext sparkContext = new JavaSparkContext(conf);
		JavaRDD<Integer> rdd = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5), 3).cache();
		JavaRDD<Integer> evenRDD = rdd.filter(new org.apache.spark.api.java.function.Function<Integer, Boolean>() {
			@Override
			public Boolean call(Integer v1) throws Exception {
				return (v1 % 2) == 0;
			}
		});

		evenRDD.persist(StorageLevel.MEMORY_AND_DISK());
		evenRDD.foreach(new VoidFunction<Integer>() {
			@Override
			public void call(Integer t) throws Exception {
				System.out.println("The value of RDD are :" + t);
			}
		});
		// unpersisting the RDD
		evenRDD.unpersist();
		rdd.unpersist();

		   /* JavaRDD<String> lines = spark.read().textFile(logFile).javaRDD().cache();
		    System.out.println("DEBUG: \n"+ lines.toDebugString());
		   long word= lines.count();
		   JavaRDD<String> distinctLines=lines.distinct();
		   System.out.println("DEBUG: \n"+ distinctLines.toDebugString());
		   JavaRDD<String> finalRdd=lines.subtract(distinctLines);
		    
		   
		   System.out.println("DEBUG: \n"+ finalRdd.toDebugString());
		   System.out.println("The count is "+word);
		   System.out.println("The count is "+distinctLines.count());
		   System.out.println("The count is "+finalRdd.count());
		   
		   finalRdd.foreach(new VoidFunction<String>() {
			
			@Override
			public void call(String t) throws Exception {
				// TODO Auto-generated method stub
				System.out.println(t);
			}
		});
*/	    /*SparkConf conf = new SparkConf().setAppName("Simple Application");
	    JavaSparkContext sc = new JavaSparkContext(conf);
	    StorageLevel newLevel;
		JavaRDD<String> logData = sc.textFile(logFile).cache();

	    long numAs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) {
	      public Boolean call(String s) { return s.contains("a"); }
	    }).count();

	    long numBs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) {
	      public Boolean call(String s) { return s.contains("b"); }
	    }).count();

	    System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
	    
	    sc.stop();*/

	}
 
Example 17
Source File: TestNd4jKryoSerialization.java    From deeplearning4j with Apache License 2.0
@Test
public void testSerialization() {

    Tuple2<INDArray, INDArray> t2 = new Tuple2<>(Nd4j.linspace(1, 10, 10, DataType.FLOAT), Nd4j.linspace(10, 20, 10, DataType.FLOAT));

    Broadcast<Tuple2<INDArray, INDArray>> b = sc.broadcast(t2);

    List<INDArray> list = new ArrayList<>();
    for (int i = 0; i < 100; i++) {
        list.add(Nd4j.ones(5));
    }

    JavaRDD<INDArray> rdd = sc.parallelize(list);

    rdd.foreach(new AssertFn(b));
}
 
Example 18
Source File: JsonFileOperations.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
	System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
	Logger rootLogger = LogManager.getRootLogger();
	rootLogger.setLevel(Level.WARN);
	SparkSession sparkSession = SparkSession
			.builder()
			.master("local")
			.config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
			.appName("JavaALSExample")
			.getOrCreate();

	RDD<String> textFile = sparkSession.sparkContext().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json", 2);

	JavaRDD<PersonDetails> mapParser = textFile.toJavaRDD().map(v1 -> new ObjectMapper().readValue(v1, PersonDetails.class));

	mapParser.foreach(t -> System.out.println(t));

	Dataset<Row> anotherPeople = sparkSession.read().json(textFile);

	anotherPeople.printSchema();
	anotherPeople.show();

	Dataset<Row> json_rec = sparkSession.read().json("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json");
	json_rec.printSchema();
	json_rec.show();

	StructType schema = new StructType(new StructField[] {
			DataTypes.createStructField("cid", DataTypes.IntegerType, true),
			DataTypes.createStructField("county", DataTypes.StringType, true),
			DataTypes.createStructField("firstName", DataTypes.StringType, true),
			DataTypes.createStructField("sex", DataTypes.StringType, true),
			DataTypes.createStructField("year", DataTypes.StringType, true),
			DataTypes.createStructField("dateOfBirth", DataTypes.TimestampType, true) });

	/* StructType pep = new StructType(new StructField[] {
			new StructField("Count", DataTypes.StringType, true, Metadata.empty()),
			new StructField("County", DataTypes.StringType, true, Metadata.empty()),
			new StructField("First Name", DataTypes.StringType, true, Metadata.empty()),
			new StructField("Sex", DataTypes.StringType, true, Metadata.empty()),
			new StructField("Year", DataTypes.StringType, true, Metadata.empty()),
			new StructField("timestamp", DataTypes.TimestampType, true, Metadata.empty()) }); */

	Dataset<Row> person_mod = sparkSession.read().schema(schema).json(textFile);

	person_mod.printSchema();
	person_mod.show();

	person_mod.write().format("json").mode("overwrite").save("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_out.json");

}
 
Example 19
Source File: TestHoodieIndex.java    From hudi with Apache License 2.0
@ParameterizedTest
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
public void testSimpleTagLocationAndUpdate(IndexType indexType) throws Exception {
  setUp(indexType);
  String newCommitTime = "001";
  int totalRecords = 10 + random.nextInt(20);
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
  JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);

  metaClient = HoodieTableMetaClient.reload(metaClient);
  HoodieTable hoodieTable = HoodieTable.create(metaClient, config, jsc.hadoopConfiguration());

  // Test tagLocation without any entries in index
  JavaRDD<HoodieRecord> javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
  assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);

  // Insert totalRecords records
  writeClient.startCommitWithTime(newCommitTime);
  JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
  assertNoWriteErrors(writeStatues.collect());

  // Now tagLocation for these records, index should not tag them since it was a failed
  // commit
  javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
  assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
  // Now commit this & update location of records inserted and validate no errors
  writeClient.commit(newCommitTime, writeStatues);
  // Now tagLocation for these records, index should tag them correctly
  metaClient = HoodieTableMetaClient.reload(metaClient);
  hoodieTable = HoodieTable.create(metaClient, config, jsc.hadoopConfiguration());
  javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
  Map<String, String> recordKeyToPartitionPathMap = new HashMap();
  List<HoodieRecord> hoodieRecords = writeRecords.collect();
  hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath()));

  assertEquals(totalRecords, javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size());
  assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count());
  assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null
      && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
  javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch"));

  JavaRDD<HoodieKey> hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey());
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = index.fetchRecordLocation(hoodieKeyJavaRDD, jsc, hoodieTable);
  List<HoodieKey> hoodieKeys = hoodieKeyJavaRDD.collect();
  assertEquals(totalRecords, recordLocations.collect().size());
  assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count());
  recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey"));
  recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch"));
}
 
Example 20
Source File: DecompressInterleave.java    From ViraPipe with MIT License
private static void splitFastq(FileStatus fst, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, new Configuration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

    splitRDD.foreach(split -> {
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
      writeFastqFile(fqreader, new Configuration(), splitDir + "/" + split.getPath().getName() + "_" + split.getStart() + ".fq");
    });
  }