org.apache.spark.mllib.recommendation.MatrixFactorizationModel Java Examples

The following examples show how to use org.apache.spark.mllib.recommendation.MatrixFactorizationModel. You can vote up the examples you find useful or vote down those you don't, and you can go to the original project or source file by following the links above each example. You can also check out the related API usage on the sidebar.
Example #1
Source File: ALSTest.java    From DDF with Apache License 2.0 6 votes vote down vote up
@Ignore
public void TestALS() throws DDFException {
  createTableRatings();

  // Load (user, movie, score) triples as a DDF.
  DDF ratingsDDF = manager.sql2ddf("select userid, movieid, score from ratings", false);

  // ALS hyper-parameters: latent rank, number of iterations, regularization.
  int numFeatures = 3;
  double regularization = 10;
  int numIterations = 15;

  MatrixFactorizationModel model =
      (MatrixFactorizationModel) ratingsDDF.ML
          .train("collaborativeFiltering", numFeatures, numIterations, regularization)
          .getRawModel();

  // Predicted score for user 1 on movie 4.
  double predicted = model.predict(1, 4);
  System.out.println(">>>RATING: " + predicted);
  manager.shutdown();
}
 
Example #2
Source File: Evaluation.java    From oryx with Apache License 2.0 6 votes vote down vote up
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  // Key each held-out rating by its (user, product) pair.
  JavaPairRDD<Tuple2<Integer,Integer>,Double> actualByUserProduct =
      testData.mapToPair(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));
  // predict() requires an untyped RDD of (user, product) pairs; the double cast erases types.
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> userProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) actualByUserProduct.keys().rdd();
  JavaRDD<Rating> predicted = testData.wrapRDD(mfModel.predict(userProducts));
  // Join predictions to actual values on (user, product), then average squared error.
  double meanSquaredError = predicted
      .mapToPair(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()))
      .join(actualByUserProduct)
      .values()
      .mapToDouble(predictedAndActual -> {
        double delta = predictedAndActual._1() - predictedAndActual._2();
        return delta * delta;
      })
      .mean();
  return Math.sqrt(meanSquaredError);
}
 
Example #3
Source File: JavaALS.java    From SparkDemo with MIT License 5 votes vote down vote up
public static void main(String[] args) {

    // Args: ratings file, rank, iterations, output dir, optional block count.
    if (args.length < 4) {
      System.err.println(
        "Usage: JavaALS <ratings_file> <rank> <iterations> <output_dir> [<blocks>]");
      System.exit(1);
    }
    SparkConf conf = new SparkConf().setAppName("JavaALS");
    int latentRank = Integer.parseInt(args[1]);
    int numIterations = Integer.parseInt(args[2]);
    String outputPath = args[3];
    // -1 lets ALS auto-configure the number of blocks.
    int numBlocks = (args.length == 5) ? Integer.parseInt(args[4]) : -1;

    JavaSparkContext context = new JavaSparkContext(conf);
    JavaRDD<Rating> ratings = context.textFile(args[0]).map(new ParseRating());

    MatrixFactorizationModel model =
        ALS.train(ratings.rdd(), latentRank, numIterations, 0.01, numBlocks);

    // Persist the learned factor matrices as text under the output directory.
    model.userFeatures().toJavaRDD().map(new FeaturesToString())
        .saveAsTextFile(outputPath + "/userFeatures");
    model.productFeatures().toJavaRDD().map(new FeaturesToString())
        .saveAsTextFile(outputPath + "/productFeatures");
    System.out.println("Final user/product features written to " + outputPath);

    context.stop();
  }
 
Example #4
Source File: ALSUpdate.java    From oryx with Apache License 2.0 5 votes vote down vote up
// Rebuilds an MLlib MatrixFactorizationModel from a PMML document whose "X"/"Y"
// Extensions point at the serialized user/item feature matrix files.
private static MatrixFactorizationModel pmmlToMFModel(JavaSparkContext sparkContext,
                                                      PMML pmml,
                                                      Path modelParentPath,
                                                      Broadcast<Map<String,Integer>> bUserIDToIndex,
                                                      Broadcast<Map<String,Integer>> bItemIDToIndex) {
  String userFeaturesFile = AppPMMLUtils.getExtensionValue(pmml, "X");
  String itemFeaturesFile = AppPMMLUtils.getExtensionValue(pmml, "Y");
  JavaPairRDD<String,float[]> userFeatures =
      readFeaturesRDD(sparkContext, new Path(modelParentPath, userFeaturesFile));
  JavaPairRDD<String,float[]> itemFeatures =
      readFeaturesRDD(sparkContext, new Path(modelParentPath, itemFeaturesFile));
  // Rank equals the length of any feature vector; read it off the first user row.
  int rank = userFeatures.first()._2().length;
  return new MatrixFactorizationModel(
      rank,
      readAndConvertFeatureRDD(userFeatures, bUserIDToIndex),
      readAndConvertFeatureRDD(itemFeatures, bItemIDToIndex));
}
 
Example #5
Source File: Evaluation.java    From oryx with Apache License 2.0 5 votes vote down vote up
// Predicts a rating for every (user, product) pair and groups the results by user ID.
private static JavaPairRDD<Integer,Iterable<Rating>> predictAll(
    MatrixFactorizationModel mfModel,
    JavaRDD<Rating> data,
    JavaPairRDD<Integer,Integer> userProducts) {
  // predict() wants an untyped pair RDD; the double cast erases the key/value types.
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> rawUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) userProducts.rdd();
  JavaRDD<Rating> predictions = data.wrapRDD(mfModel.predict(rawUserProducts));
  return predictions.groupBy(Rating::user);
}
 
Example #6
Source File: CollabFilterCassandra7.java    From Spark-Cassandra-Collabfiltering with Apache License 2.0 5 votes vote down vote up
public MatrixFactorizationModel train(JavaSparkContext sparkCtx, CassandraConnector cassandraConnector) {
	// Load the ratings table, then convert each Cassandra row to a Spark Rating.
	// (Java-7 variant: anonymous class instead of a lambda, deliberately.)
	CassandraJavaRDD<CassandraRow> ratingsTable = javaFunctions(sparkCtx).cassandraTable(RatingDO.EMPLOYERRATINGS_KEYSPACE, RatingDO.RATINGS_TABLE);
	JavaRDD<Rating> ratings = ratingsTable.map(new org.apache.spark.api.java.function.Function<CassandraRow, Rating>() {
		@Override
		public Rating call(CassandraRow row) throws Exception {
			return new Rating(row.getInt(RatingDO.USER_COL), row.getInt(RatingDO.PRODUCT_COL), row.getDouble(RatingDO.RATING_COL));
		}
	});
	// Factor the rating matrix with ALS using the class-level hyper-parameters.
	return ALS.train(JavaRDD.toRDD(ratings), RANK, ITER, LAMBDA);
}
 
Example #7
Source File: CollabFilterCassandra7.java    From Spark-Cassandra-Collabfiltering with Apache License 2.0 5 votes vote down vote up
public JavaRDD<Rating> predict(MatrixFactorizationModel model, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
	// Project each validation row down to an untyped (user, product) pair for predict().
	// (Java-7 variant: anonymous class instead of a lambda, deliberately.)
	RDD<Tuple2<Object, Object>> userProductPairs = JavaRDD.toRDD(validationsCassRdd.map(new org.apache.spark.api.java.function.Function<CassandraRow, Tuple2<Object, Object>>() {
		@Override
		public Tuple2<Object, Object> call(CassandraRow row) throws Exception {
			return new Tuple2<Object, Object>(row.getInt(RatingDO.USER_COL), row.getInt(RatingDO.PRODUCT_COL));
		}
	}));
	return model.predict(userProductPairs).toJavaRDD();
}
 
Example #8
Source File: CollabFilterCassandraDriver.java    From Spark-Cassandra-Collabfiltering with Apache License 2.0 5 votes vote down vote up
// Trains a model with the version-selected implementation, predicts against the
// validation table, prints a report, and returns the RMSE.
double trainAndValidate(int version) throws InstantiationException, IllegalAccessException, ClassNotFoundException {
	// Reflectively select the Java-7 or Java-8 implementation by version suffix.
	String implClassName = "collabfilter.CollabFilterCassandra" + version;
	final ICollabFilterCassandra impl = (ICollabFilterCassandra) Class.forName(implClassName).newInstance();
	try (Session session = this.cassandraConnector.openSession()) {
		MatrixFactorizationModel model = impl.train(this.sparkCtx, this.cassandraConnector);
		CassandraJavaRDD<CassandraRow> validations = javaFunctions(this.sparkCtx).cassandraTable(RatingDO.EMPLOYERRATINGS_KEYSPACE, RatingDO.VALIDATION_TABLE);
		JavaRDD<Rating> predictions = impl.predict(model, validations);
		double rmse = impl.validate(predictions, validations);
		System.out.println(impl.resultsReport(predictions, validations, rmse));
		return rmse;
	}

}
 
Example #9
Source File: ALSUpdate.java    From oryx with Apache License 2.0 4 votes vote down vote up
/**
 * Trains an ALS matrix-factorization model on the given input and returns it as a
 * PMML document (an ad-hoc form whose Extensions point at matrix files; see
 * {@code mfModelToPMML}).
 *
 * @param sparkContext active Spark context
 * @param trainData raw input lines to train on
 * @param hyperParameters positional hyper-parameters: features, lambda, alpha,
 *        and (only when {@code logStrength}) epsilon
 * @param candidatePath directory under which model data is written
 * @return PMML representation of the trained model
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int features = (Integer) hyperParameters.get(0);
  double lambda = (Double) hyperParameters.get(1);
  double alpha = (Double) hyperParameters.get(2);
  double epsilon = Double.NaN;
  if (logStrength) {
    // epsilon is only supplied when input strengths are log-scaled
    epsilon = (Double) hyperParameters.get(3);
  }
  Preconditions.checkArgument(features > 0);
  Preconditions.checkArgument(lambda >= 0.0);
  Preconditions.checkArgument(alpha > 0.0);
  if (logStrength) {
    Preconditions.checkArgument(epsilon > 0.0);
  }

  JavaRDD<String[]> parsedRDD = trainData.map(MLFunctions.PARSE_FN);
  // Cached: scanned several times below (two ID mappings plus the rating RDD).
  parsedRDD.cache();

  // ALS works on int indices, so String IDs are mapped both ways.
  Map<String,Integer> userIDIndexMap = buildIDIndexMapping(parsedRDD, true);
  Map<String,Integer> itemIDIndexMap = buildIDIndexMapping(parsedRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDIndexMap.size(), itemIDIndexMap.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDIndexMap);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDIndexMap);

  JavaRDD<Rating> trainRatingData = parsedToRatingRDD(parsedRDD, bUserIDToIndex, bItemIDToIndex);
  trainRatingData = aggregateScores(trainRatingData, epsilon);
  ALS als = new ALS()
      .setRank(features)
      .setIterations(iterations)
      .setLambda(lambda)
      .setCheckpointInterval(5);
  if (implicit) {
    als = als.setImplicitPrefs(true).setAlpha(alpha);
  }

  // Cache the ratings while ALS iterates over them; release immediately after.
  RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
  trainingRatingDataRDD.cache();
  MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
  trainingRatingDataRDD.unpersist(false);

  // Training inputs are no longer needed once the model exists.
  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();

  parsedRDD.unpersist();

  // Inverse mappings translate int indices back to original String IDs for output.
  Broadcast<Map<Integer,String>> bUserIndexToID = sparkContext.broadcast(invertMap(userIDIndexMap));
  Broadcast<Map<Integer,String>> bItemIndexToID = sparkContext.broadcast(invertMap(itemIDIndexMap));

  PMML pmml = mfModelToPMML(model,
                            features,
                            lambda,
                            alpha,
                            epsilon,
                            implicit,
                            logStrength,
                            candidatePath,
                            bUserIndexToID,
                            bItemIndexToID);
  // Release the RDDs persisted inside the model (see unpersist(model) helper).
  unpersist(model);

  bUserIndexToID.unpersist();
  bItemIndexToID.unpersist();

  return pmml;
}
 
Example #10
Source File: ALSUpdate.java    From oryx with Apache License 2.0 4 votes vote down vote up
/**
 * Evaluates a PMML-serialized model against held-out test data. Returns AUC for
 * implicit-feedback models, otherwise negated RMSE — so larger is always better.
 *
 * @param sparkContext active Spark context
 * @param model PMML form of the model to evaluate
 * @param modelParentPath directory containing the model's feature matrix files
 * @param testData held-out input lines
 * @param trainData training input lines (not used by this method)
 * @return AUC, or -RMSE; higher is better
 */
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {

  JavaRDD<String[]> parsedTestRDD = testData.map(MLFunctions.PARSE_FN);
  // Cached: scanned for both ID mappings and the rating RDD below.
  parsedTestRDD.cache();

  Map<String,Integer> userIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, true);
  Map<String,Integer> itemIDToIndex = buildIDIndexOneWayMap(model, parsedTestRDD, false);

  log.info("Broadcasting ID-index mappings for {} users, {} items",
           userIDToIndex.size(), itemIDToIndex.size());

  Broadcast<Map<String,Integer>> bUserIDToIndex = sparkContext.broadcast(userIDToIndex);
  Broadcast<Map<String,Integer>> bItemIDToIndex = sparkContext.broadcast(itemIDToIndex);

  JavaRDD<Rating> testRatingData = parsedToRatingRDD(parsedTestRDD, bUserIDToIndex, bItemIDToIndex);
  double epsilon = Double.NaN;
  if (logStrength) {
    // Recover the epsilon recorded at training time from the PMML extensions.
    epsilon = Double.parseDouble(AppPMMLUtils.getExtensionValue(model, "epsilon"));
  }
  testRatingData = aggregateScores(testRatingData, epsilon);

  // Rehydrate the factored matrices referenced by the PMML into an MLlib model.
  MatrixFactorizationModel mfModel =
      pmmlToMFModel(sparkContext, model, modelParentPath, bUserIDToIndex, bItemIDToIndex);

  parsedTestRDD.unpersist();

  double eval;
  if (implicit) {
    double auc = Evaluation.areaUnderCurve(sparkContext, mfModel, testRatingData);
    log.info("AUC: {}", auc);
    eval = auc;
  } else {
    double rmse = Evaluation.rmse(mfModel, testRatingData);
    log.info("RMSE: {}", rmse);
    // Negated so that a larger evaluation value is always better.
    eval = -rmse;
  }
  unpersist(mfModel);

  bUserIDToIndex.unpersist();
  bItemIDToIndex.unpersist();

  return eval;
}
 
Example #11
Source File: ALSUpdate.java    From oryx with Apache License 2.0 4 votes vote down vote up
/**
 * There is no actual serialization of a massive factored matrix model into PMML.
 * Instead, we create an ad-hoc serialization where the model just contains pointers
 * to files that contain the matrix data, as Extensions.
 *
 * @param model trained factorization model to serialize
 * @param features rank used to train the model
 * @param lambda regularization parameter used
 * @param alpha confidence parameter (recorded only when {@code implicit})
 * @param epsilon log-strength parameter (recorded only when {@code logStrength})
 * @param implicit whether implicit feedback was used
 * @param logStrength whether input strengths were log-scaled
 * @param candidatePath directory under which the X/Y matrices are written
 * @param bUserIndexToID mapping from int user index back to original String ID
 * @param bItemIndexToID mapping from int item index back to original String ID
 * @return skeleton PMML whose Extensions point at the saved matrix data
 */
private static PMML mfModelToPMML(MatrixFactorizationModel model,
                                  int features,
                                  double lambda,
                                  double alpha,
                                  double epsilon,
                                  boolean implicit,
                                  boolean logStrength,
                                  Path candidatePath,
                                  Broadcast<Map<Integer,String>> bUserIndexToID,
                                  Broadcast<Map<Integer,String>> bItemIndexToID) {

  // Narrow double feature vectors to float before saving.
  Function<double[],float[]> doubleArrayToFloats = d -> {
    float[] f = new float[d.length];
    for (int i = 0; i < f.length; i++) {
      f[i] = (float) d[i];
    }
    return f;
  };

  JavaPairRDD<Integer,float[]> userFeaturesRDD =
      massageToIntKey(model.userFeatures()).mapValues(doubleArrayToFloats);
  JavaPairRDD<Integer,float[]> itemFeaturesRDD =
      massageToIntKey(model.productFeatures()).mapValues(doubleArrayToFloats);

  // Write the two factor matrices under candidatePath/X and candidatePath/Y.
  saveFeaturesRDD(userFeaturesRDD, new Path(candidatePath, "X"), bUserIndexToID);
  saveFeaturesRDD(itemFeaturesRDD, new Path(candidatePath, "Y"), bItemIndexToID);

  // Record file pointers and hyper-parameters as PMML Extensions.
  PMML pmml = PMMLUtils.buildSkeletonPMML();
  AppPMMLUtils.addExtension(pmml, "X", "X/");
  AppPMMLUtils.addExtension(pmml, "Y", "Y/");
  AppPMMLUtils.addExtension(pmml, "features", features);
  AppPMMLUtils.addExtension(pmml, "lambda", lambda);
  AppPMMLUtils.addExtension(pmml, "implicit", implicit);
  if (implicit) {
    AppPMMLUtils.addExtension(pmml, "alpha", alpha);
  }
  AppPMMLUtils.addExtension(pmml, "logStrength", logStrength);
  if (logStrength) {
    AppPMMLUtils.addExtension(pmml, "epsilon", epsilon);
  }
  addIDsExtension(pmml, "XIDs", userFeaturesRDD, bUserIndexToID.value());
  addIDsExtension(pmml, "YIDs", itemFeaturesRDD, bItemIndexToID.value());
  return pmml;
}
 
Example #12
Source File: CollabFilterCassandra8.java    From Spark-Cassandra-Collabfiltering with Apache License 2.0 4 votes vote down vote up
public MatrixFactorizationModel train(JavaSparkContext sparkCtx, CassandraConnector cassandraConnector) {
	// Load the ratings table and map each Cassandra row to a Spark Rating (Java-8 lambda variant).
	CassandraJavaRDD<CassandraRow> ratingsTable = javaFunctions(sparkCtx).cassandraTable(RatingDO.EMPLOYERRATINGS_KEYSPACE, RatingDO.RATINGS_TABLE);
	JavaRDD<Rating> ratings = ratingsTable
			.map(row -> new Rating(row.getInt(RatingDO.USER_COL), row.getInt(RatingDO.PRODUCT_COL), row.getDouble(RatingDO.RATING_COL)));
	// Factor the rating matrix with ALS using the class-level hyper-parameters.
	return ALS.train(JavaRDD.toRDD(ratings), RANK, ITER, LAMBDA);
}
 
Example #13
Source File: CollabFilterCassandra8.java    From Spark-Cassandra-Collabfiltering with Apache License 2.0 4 votes vote down vote up
public JavaRDD<Rating> predict(MatrixFactorizationModel model, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
	// Project each validation row down to an untyped (user, product) pair for predict().
	RDD<Tuple2<Object, Object>> userProductPairs = JavaRDD.toRDD(validationsCassRdd
			.map(row -> new Tuple2<Object, Object>(row.getInt(RatingDO.USER_COL), row.getInt(RatingDO.PRODUCT_COL))));
	return model.predict(userProductPairs).toJavaRDD();
}
 
Example #14
Source File: ALSUpdate.java    From oryx with Apache License 2.0 2 votes vote down vote up
/**
 * Manually unpersists the RDDs that are persisted inside a model.
 *
 * @param model model whose RDDs were persisted
 */
private static void unpersist(MatrixFactorizationModel model) {
  // Release both cached factor matrices; false = don't block on removal.
  model.productFeatures().unpersist(false);
  model.userFeatures().unpersist(false);
}
 
Example #15
Source File: ICollabFilterCassandra.java    From Spark-Cassandra-Collabfiltering with Apache License 2.0 votes vote down vote up
// Trains a matrix-factorization model from rating data stored in Cassandra.
MatrixFactorizationModel train(JavaSparkContext sparkCtx, CassandraConnector cassandraConnector); 
Example #16
Source File: ICollabFilterCassandra.java    From Spark-Cassandra-Collabfiltering with Apache License 2.0 votes vote down vote up
// Predicts ratings for the (user, product) pairs in the given validation rows.
JavaRDD<Rating> predict(MatrixFactorizationModel model, CassandraJavaRDD<CassandraRow> validationsCassRdd);