Java Code Examples for org.apache.spark.api.java.JavaRDD#mapToPair()

The following examples show how to use org.apache.spark.api.java.JavaRDD#mapToPair(). They are taken from open source projects; the original project and source file are noted above each example.
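
Before the project examples, here is a minimal, self-contained sketch of the call: mapToPair maps each element of a JavaRDD to a scala.Tuple2, producing a JavaPairRDD on which key-based operations such as reduceByKey become available. The class name and input data below are illustrative only and do not come from any of the projects listed.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MapToPairSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("MapToPairSketch").setMaster("local[2]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      JavaRDD<String> lines = jsc.parallelize(Arrays.asList("alice|3", "bob|1", "alice|2"));
      // mapToPair maps each element to a scala.Tuple2, turning the JavaRDD into a JavaPairRDD
      JavaPairRDD<String, Integer> pairs = lines.mapToPair(line -> {
        String[] parts = line.split("\\|");
        return new Tuple2<>(parts[0], Integer.parseInt(parts[1]));
      });
      // key-based operations such as reduceByKey are only available on pair RDDs
      pairs.reduceByKey(Integer::sum)
           .collect()
           .forEach(t -> System.out.println(t._1() + " -> " + t._2()));
    }
  }
}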
Example 1
Source File: MLContextConversionUtil.java    From systemds with Apache License 2.0
/**
 * Convert a {@code JavaRDD<String>} in IJV format to a {@code FrameObject}.
 * Note that metadata is required for IJV format.
 *
 * @param javaRDD
 *            the Java RDD of strings
 * @param frameMetadata
 *            frame metadata
 * @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
 */
public static FrameObject javaRDDStringIJVToFrameObject(JavaRDD<String> javaRDD,
		FrameMetadata frameMetadata) {
	JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
	MatrixCharacteristics mc = (frameMetadata != null) ? frameMetadata.asMatrixCharacteristics()
			: new MatrixCharacteristics();

	JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());

	FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(),
			new MetaDataFormat(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo),
			frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
	JavaPairRDD<Long, FrameBlock> rdd;
	try {
		ValueType[] lschema = UtilFunctions.nCopies((int) mc.getCols(), ValueType.STRING);
		rdd = FrameRDDConverterUtils.textCellToBinaryBlock(jsc(), javaPairRDDText, mc, lschema);
	} catch (DMLRuntimeException e) {
		e.printStackTrace();
		return null;
	}
	frameObject.setRDDHandle(new RDDObject(rdd));
	return frameObject;
}
 
Example 2
Source File: HDFSWriter.java    From ViraPipe with MIT License
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritable(JavaRDD<SAMRecord> records, Broadcast<SAMFileHeader> header) {
    return records.mapToPair(read -> {

        // The sequence dictionary must be set here for the alignment because it is not provided with the header file;
        // it is set in the alignment-to-SAM map phase
        if(header.getValue().getSequenceDictionary()==null) header.getValue().setSequenceDictionary(new SAMSequenceDictionary());
        if(header.getValue().getSequenceDictionary().getSequence(read.getReferenceName())==null)
            header.getValue().getSequenceDictionary().addSequence(new SAMSequenceRecord(read.getReferenceName()));

        //read.setHeader(read.getHeader());
        read.setHeaderStrict(header.getValue());
        final SAMRecordWritable samRecordWritable = new SAMRecordWritable();
        samRecordWritable.set(read);
        return new Tuple2<>(read, samRecordWritable);
    });
}
 
Example 3
Source File: Evaluation.java    From oryx with Apache License 2.0
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
      testData.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> testUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
  JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
  double mse = predictions.mapToPair(
      rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
  ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
    double diff = valuePrediction._1() - valuePrediction._2();
    return diff * diff;
  }).mean();
  return Math.sqrt(mse);
}
 
Example 4
Source File: SparkTransformExecutor.java    From DataVec with Apache License 2.0
/**
 * Execute a join on the specified data
 *
 * @param join  Join to execute
 * @param left  Left data for join
 * @param right Right data for join
 * @return Joined data
 */
public static JavaRDD<List<Writable>> executeJoin(Join join, JavaRDD<List<Writable>> left,
                JavaRDD<List<Writable>> right) {

    String[] leftColumnNames = join.getJoinColumnsLeft();
    int[] leftColumnIndexes = new int[leftColumnNames.length];
    for (int i = 0; i < leftColumnNames.length; i++) {
        leftColumnIndexes[i] = join.getLeftSchema().getIndexOfColumn(leftColumnNames[i]);
    }

    JavaPairRDD<List<Writable>, List<Writable>> leftJV = left.mapToPair(new ExtractKeysFunction(leftColumnIndexes));

    String[] rightColumnNames = join.getJoinColumnsRight();
    int[] rightColumnIndexes = new int[rightColumnNames.length];
    for (int i = 0; i < rightColumnNames.length; i++) {
        rightColumnIndexes[i] = join.getRightSchema().getIndexOfColumn(rightColumnNames[i]);
    }

    JavaPairRDD<List<Writable>, List<Writable>> rightJV =
                    right.mapToPair(new ExtractKeysFunction(rightColumnIndexes));

    JavaPairRDD<List<Writable>, Tuple2<Iterable<List<Writable>>, Iterable<List<Writable>>>> cogroupedJV =
                    leftJV.cogroup(rightJV);

    return cogroupedJV.flatMap(new ExecuteJoinFromCoGroupFlatMapFunction(join));
}
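
The join above is, at its core, a plain Spark cogroup join: key both sides with mapToPair, cogroup them, and flatten the grouped values with a flatMap. A stripped-down, hypothetical fragment of that pattern with plain strings (it assumes an existing JavaSparkContext jsc and is independent of the DataVec classes):

JavaPairRDD<Integer, String> leftKeyed = jsc.parallelizePairs(
        Arrays.asList(new Tuple2<>(1, "left-1"), new Tuple2<>(2, "left-2")));
JavaPairRDD<Integer, String> rightKeyed = jsc.parallelizePairs(
        Arrays.asList(new Tuple2<>(1, "right-1"), new Tuple2<>(1, "right-2")));
// cogroup collects, per key, an Iterable of left values and an Iterable of right values;
// a flatMap over this structure can then emit the joined records
JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<String>>> cogrouped = leftKeyed.cogroup(rightKeyed);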
 
Example 5
Source File: InstancePartitioner.java    From rdf2x with Apache License 2.0
/**
 * Partition instances by the specified partitioning (e.g. by instance type)
 *
 * @param instances RDD of instances to partition
 * @return partitioned RDD if requested, original RDD if no partitioning is specified
 */
public JavaRDD<Instance> partition(JavaRDD<Instance> instances) {
    if (!config.isRepartitionByType()) {
        return instances;
    }
    log.info("Getting counts by type hash");
    Map<Integer, Long> typeCounts = getApproximateTypeHashCounts(instances);
    int numPartitions = instances.getNumPartitions();
    long totalInstances = instances.count();
    long instancesPerPartition = totalInstances / numPartitions + 1;

    JavaPairRDD<Integer, Instance> instanceWithPartitions = instances.mapToPair(instance -> {
        int typeHash = getTypeHash(instance);
        int splitIncrement = getSplitIncrement(instance.getId(), typeCounts.get(typeHash), instancesPerPartition);
        return new Tuple2<>(typeHash + splitIncrement, instance);
    });

    log.info("Partitioning instances by type");
    return instanceWithPartitions
            .partitionBy(new HashPartitioner(numPartitions))
            .values();
}
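
The repartitioning above follows a common pattern: tag each element with a synthetic integer key via mapToPair, call partitionBy with a HashPartitioner, and drop the key again with values(). A hypothetical stripped-down fragment of just that pattern, assuming an existing JavaSparkContext jsc and using the first character as a stand-in for the type hash:

JavaRDD<String> repartitioned = jsc.parallelize(Arrays.asList("apple", "avocado", "banana"))
        .mapToPair(s -> new Tuple2<>((int) s.charAt(0), s)) // synthetic partitioning key
        .partitionBy(new HashPartitioner(2))                // equal keys land on the same partition
        .values();                                          // drop the key, keep the elements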
 
Example 6
Source File: CountLines.java    From examples with Apache License 2.0
@SuppressWarnings("serial")
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkGetExample ").setMaster("local[2]");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  JavaRDD<String> textFile = jsc.textFile("hdfs://localhost/user/cloudera/data.txt");
  JavaPairRDD<String, Integer> pairs = textFile.mapToPair(new PairFunction<String, String, Integer>() {
    public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s.substring(0, s.indexOf("|")), 1); }
  });
  JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer a, Integer b) { return a + b; }
  });
  System.out.println("We have generated " + counts.count() + " users");
  jsc.close();
}
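
For comparison, the two anonymous classes above collapse into lambdas on Java 8+; this shorter form is an illustrative rewrite, not part of the original CountLines.java:

JavaPairRDD<String, Integer> pairs = textFile
        .mapToPair(s -> new Tuple2<>(s.substring(0, s.indexOf("|")), 1)); // key on the text before '|'
JavaPairRDD<String, Integer> counts = pairs.reduceByKey((a, b) -> a + b); // sum the 1s per user
System.out.println("We have generated " + counts.count() + " users");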
 
Example 7
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> csvToBinaryBlock(JavaSparkContext sc,
		JavaRDD<String> input, DataCharacteristics mcOut,
		boolean hasHeader, String delim, boolean fill, double fillValue) 
{
	//convert string rdd to serializable longwritable/text
	JavaPairRDD<LongWritable, Text> prepinput =
		input.mapToPair(new StringToSerTextFunction());
	
	//convert to binary block
	return csvToBinaryBlock(sc, prepinput, mcOut, hasHeader, delim, fill, fillValue);
}
 
Example 8
Source File: ALSUpdate.java    From oryx with Apache License 2.0
/**
 * @param parsedRDD parsed input as {@code String[]}
 * @return {@link Rating}s ordered by timestamp
 */
private JavaRDD<Rating> parsedToRatingRDD(JavaRDD<String[]> parsedRDD,
                                          Broadcast<? extends Map<String,Integer>> bUserIDToIndex,
                                          Broadcast<? extends Map<String,Integer>> bItemIDToIndex) {
  JavaPairRDD<Long,Rating> timestampRatingRDD = parsedRDD.mapToPair(tokens -> {
    try {
      return new Tuple2<>(
          Long.valueOf(tokens[3]),
          new Rating(bUserIDToIndex.value().get(tokens[0]),
                     bItemIDToIndex.value().get(tokens[1]),
                     // Empty value means 'delete'; propagate as NaN
                     tokens[2].isEmpty() ? Double.NaN : Double.parseDouble(tokens[2])));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(tokens));
      throw e;
    }
  });

  if (decayFactor < 1.0) {
    double factor = decayFactor;
    long now = System.currentTimeMillis();
    timestampRatingRDD = timestampRatingRDD.mapToPair(timestampRating -> {
        long timestamp = timestampRating._1();
        return new Tuple2<>(timestamp, decayRating(timestampRating._2(), timestamp, now, factor));
      });
  }

  if (decayZeroThreshold > 0.0) {
    double theThreshold = decayZeroThreshold;
    timestampRatingRDD = timestampRatingRDD.filter(timestampRating -> timestampRating._2().rating() > theThreshold);
  }

  return timestampRatingRDD.sortByKey().values();
}
 
Example 9
Source File: JavaNaiveBayesExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
  JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
  JavaRDD<LabeledPoint> training = tmp[0]; // training set
  JavaRDD<LabeledPoint> test = tmp[1]; // test set
  final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
  JavaPairRDD<Double, Double> predictionAndLabel =
    test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return pl._1().equals(pl._2());
    }
  }).count() / (double) test.count();

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myNaiveBayesModel");
  NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel");
  // $example off$

  jsc.stop();
}
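
The same pairing and accuracy computation can be written more compactly with lambdas, since model is declared final; this shorter form is illustrative and not part of the original example:

JavaPairRDD<Double, Double> predictionAndLabel =
        test.mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label()));
double accuracy = predictionAndLabel
        .filter(pl -> pl._1().equals(pl._2()))
        .count() / (double) test.count();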
 
Example 10
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Query the click action data within a specified date range
 * @param sqlContext the SQLContext used to run the query
 * @param startDate start date
 * @param endDate end date
 * @return click action data
 */
private static JavaPairRDD<Long, Row> getcityid2ClickActionRDDByDate(
		SQLContext sqlContext, String startDate, String endDate) {
	// Query user visit action data from the user_visit_action table
	// First constraint: click_product_id must be non-null, which identifies the action as a click
	// Second constraint: the data must fall within the user-specified date range
	
	String sql = 
			"SELECT "
				+ "city_id,"
				+ "click_product_id product_id "
			+ "FROM user_visit_action "
			+ "WHERE click_product_id IS NOT NULL "			
			+ "AND day>='" + startDate + "' "
			+ "AND day<='" + endDate + "'";
	
	Dataset<Row> clickActionDF = sqlContext.sql(sql);

	JavaRDD<Row> clickActionRDD = clickActionDF.javaRDD();

	JavaPairRDD<Long, Row> cityid2clickActionRDD = clickActionRDD.mapToPair(
			
			new PairFunction<Row, Long, Row>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Tuple2<Long, Row> call(Row row) throws Exception {
					Long cityid = row.getLong(0);
					return new Tuple2<Long, Row>(cityid, row);
				}
				
			});
	
	return cityid2clickActionRDD;
}
 
Example 11
Source File: HDFSWriter.java    From ViraPipe with MIT License
public static JavaPairRDD<SAMRecord, SAMRecordWritable> readsToWritableNoRef(JavaRDD<SAMRecord> records) {
    return records.mapToPair(read -> {
        //read.setHeaderStrict(read.getHeader());
        read.setHeader(read.getHeader());
        final SAMRecordWritable samRecordWritable = new SAMRecordWritable();
        samRecordWritable.set(read);
        return new Tuple2<>(read, samRecordWritable);
    });
}
 
Example 12
Source File: FrameRDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<Long, FrameBlock> csvToBinaryBlock(JavaSparkContext sc,
	JavaRDD<String> input, DataCharacteristics mcOut, ValueType[] schema,
	boolean hasHeader, String delim, boolean fill, double fillValue)
{
	//convert string rdd to serializable longwritable/text
	JavaPairRDD<LongWritable, Text> prepinput =
			input.mapToPair(new StringToSerTextFunction());
	
	//convert to binary block
	return csvToBinaryBlock(sc, prepinput, mcOut, schema, hasHeader, delim, fill, fillValue);
}
 
Example 13
Source File: UserVisitAnalyze.java    From UserActionAnalyzePlatform with Apache License 2.0
/**
 * Map the data into pairs keyed by session id, with the Row as the value
 * @param sessionRangeDate RDD of session rows within the date range
 * @return pair RDD of (session id, Row)
 */
private static JavaPairRDD<String,Row> getSessonInfoPairRDD(JavaRDD<Row> sessionRangeDate) {
    return sessionRangeDate.mapToPair(new PairFunction<Row, String, Row>() {
        @Override
        public Tuple2<String, Row> call(Row row) throws Exception {
            return new Tuple2<String, Row>(row.getString(2),row);
        }
    });
}
 
Example 14
Source File: SparkDataSet.java    From ensemble-clustering with MIT License
/***
 * The SparkDataSet loads data using this method to populate the DataSet with Instances.
 *  
 * @param path the location of the data to load (filesystem or HDFS path)
 * @param parser is the object that converts each line in the data into an Instance
 */
public void load(String path, SparkInstanceParser parser) {		
	try {
		JavaRDD<String> lines = sc.textFile(path); 
		instances = lines.mapToPair( parser );
	} catch (Exception e) {
		e.printStackTrace();
	}
}
 
Example 15
Source File: ALSUpdate.java    From oryx with Apache License 2.0
/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples =
      original.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));

  JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
  if (implicit) {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  } else {
    // For non-implicit, last wins.
    aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
  }

  JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
      aggregated.filter(kv -> !Double.isNaN(kv._2()));

  if (logStrength) {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        Math.log1p(userProductScore._2() / epsilon)));
  } else {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        userProductScore._2()));
  }
}
 
Example 16
Source File: ALSUpdate.java    From oryx with Apache License 2.0
private static JavaPairRDD<String,Collection<String>> knownsRDD(JavaRDD<String[]> allData,
                                                                boolean knownItems) {
  JavaRDD<String[]> sorted = allData.sortBy(datum -> Long.valueOf(datum[3]), true, allData.partitions().size());

  JavaPairRDD<String,Tuple2<String,Boolean>> tuples = sorted.mapToPair(datum -> {
      String user = datum[0];
      String item = datum[1];
      Boolean delete = datum[2].isEmpty();
      return knownItems ?
          new Tuple2<>(user, new Tuple2<>(item, delete)) :
          new Tuple2<>(item, new Tuple2<>(user, delete));
    });

  // TODO likely need to figure out a way to avoid groupByKey but collectByKey
  // won't work here -- doesn't guarantee enough about ordering
  return tuples.groupByKey().mapValues(idDeletes -> {
      Collection<String> ids = new HashSet<>();
      for (Tuple2<String,Boolean> idDelete : idDeletes) {
        if (idDelete._2()) {
          ids.remove(idDelete._1());
        } else {
          ids.add(idDelete._1());
        }
      }
      return ids;
    });
}
 
Example 17
Source File: PageOneStepConvertRateSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Get the data in <sessionid, user action> format
 * @param actionRDD RDD of user action rows
 * @return data in <sessionid, user action> format
 */
private static JavaPairRDD<String, Row> getSessionid2actionRDD(
		JavaRDD<Row> actionRDD) {
	return actionRDD.mapToPair(new PairFunction<Row, String, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Tuple2<String, Row> call(Row row) throws Exception {
			String sessionid = row.getString(2);
			return new Tuple2<String, Row>(sessionid, row);   
		}
		
	});
}
 
Example 18
Source File: MLContextConversionUtil.java    From systemds with Apache License 2.0
/**
 * Convert a {@code JavaRDD<String>} in CSV format to a {@code MatrixObject}
 *
 * @param javaRDD
 *            the Java RDD of strings
 * @param matrixMetadata
 *            matrix metadata
 * @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
 */
public static MatrixObject javaRDDStringCSVToMatrixObject(JavaRDD<String> javaRDD,
		MatrixMetadata matrixMetadata) {
	JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
	DataCharacteristics mc = (matrixMetadata != null) ? matrixMetadata.asMatrixCharacteristics()
			: new MatrixCharacteristics();

	MatrixObject matrixObject = new MatrixObject(ValueType.FP64, OptimizerUtils.getUniqueTempFileName(),
			new MetaDataFormat(mc, FileFormat.CSV));
	JavaPairRDD<LongWritable, Text> javaPairRDD2 = javaPairRDD.mapToPair(new CopyTextInputFunction());
	matrixObject.setRDDHandle(new RDDObject(javaPairRDD2));
	return matrixObject;
}
 
Example 19
Source File: JavaRandomForestRegressionExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestRegressionExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Set parameters.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  Integer numTrees = 3; // Use more in practice.
  String featureSubsetStrategy = "auto"; // Let the algorithm choose.
  String impurity = "variance";
  Integer maxDepth = 4;
  Integer maxBins = 32;
  Integer seed = 12345;
  // Train a RandomForest model.
  final RandomForestModel model = RandomForest.trainRegressor(trainingData,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testMSE =
    predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
      @Override
      public Double call(Tuple2<Double, Double> pl) {
        Double diff = pl._1() - pl._2();
        return diff * diff;
      }
    }).reduce(new Function2<Double, Double, Double>() {
      @Override
      public Double call(Double a, Double b) {
        return a + b;
      }
    }) / testData.count();
  System.out.println("Test Mean Squared Error: " + testMSE);
  System.out.println("Learned regression forest model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myRandomForestRegressionModel");
  RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
    "target/tmp/myRandomForestRegressionModel");
  // $example off$

  jsc.stop();
}