org.apache.spark.mllib.linalg.Vector Java Examples

The following examples show how to use org.apache.spark.mllib.linalg.Vector. Each example is taken from an open-source project; the source file, project, and license are noted above it.
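Before diving into the project examples, it may help to see how a Vector is constructed in the first place. The sketch below is not taken from any of the projects; it is a minimal, standalone illustration using only the MLlib factory class org.apache.spark.mllib.linalg.Vectors, and it needs no SparkContext.

import java.util.Arrays;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

public class VectorBasics {
  public static void main(String[] args) {
    // Dense vector: every component stored explicitly.
    Vector dense = Vectors.dense(1.0, 0.0, 3.0);

    // Sparse vector: size, indices of the non-zero entries, and their values.
    Vector sparse = Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 3.0});

    // Both describe the same point; apply(i) reads component i, toArray() materializes all of them.
    System.out.println(dense.apply(2) == sparse.apply(2));                // true
    System.out.println(Arrays.equals(dense.toArray(), sparse.toArray())); // true
  }
}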
Example #1
Source File: AbstractKMeansEvaluation.java    From oryx with Apache License 2.0
/**
 * @param evalData points to cluster for evaluation
 * @return cluster IDs as keys, and metrics for each cluster like the count, sum of distances to centroid,
 *  and sum of squared distances
 */
JavaPairRDD<Integer,ClusterMetric> fetchClusterMetrics(JavaRDD<Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, new ClusterMetric(1L, closestDist, closestDist * closestDist));
  }).reduceByKey(ClusterMetric::add);
}
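Example #1 combines per-point results with reduceByKey(ClusterMetric::add). The ClusterMetric class belongs to Oryx and is not reproduced on this page; what follows is only a hypothetical sketch of such an accumulator, assuming the three fields implied by the javadoc (count, sum of distances, sum of squared distances) plus the getMeanDist() accessor used later in Examples #12 and #29.

// Hypothetical stand-in for Oryx's ClusterMetric, shown for illustration only.
final class ClusterMetric implements java.io.Serializable {
  private final long count;
  private final double sumDist;
  private final double sumSquaredDist;

  ClusterMetric(long count, double sumDist, double sumSquaredDist) {
    this.count = count;
    this.sumDist = sumDist;
    this.sumSquaredDist = sumSquaredDist;
  }

  // Associative combine, suitable for reduceByKey.
  ClusterMetric add(ClusterMetric other) {
    return new ClusterMetric(count + other.count,
                             sumDist + other.sumDist,
                             sumSquaredDist + other.sumSquaredDist);
  }

  // Mean distance from the cluster's points to its centroid.
  double getMeanDist() {
    return sumDist / count;
  }
}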
 
Example #2
Source File: IfZeroVectorBridgeTest.java    From spark-transformers with Apache License 2.0
public DataFrame createDF(JavaRDD<Tuple2<Vector, String>> rdd) {

        // Generate the schema: a vector column and a string column
        List<StructField> fields = new ArrayList<StructField>();
        fields.add(DataTypes.createStructField("vectorized_count", new VectorUDT(), true));
        fields.add(DataTypes.createStructField("product_title", DataTypes.StringType, true));

        StructType schema = DataTypes.createStructType(fields);
        // Convert records of the RDD to Rows.
        JavaRDD<Row> rowRDD = rdd.map(
                new Function<Tuple2<Vector, String>, Row>() {
                    public Row call(Tuple2<Vector, String> record) {
                        return RowFactory.create(record._1(), record._2());
                    }
                });

        return sqlContext.createDataFrame(rowRDD, schema);
    }
 
Example #3
Source File: SilhouetteCoefficient.java    From oryx with Apache License 2.0
private JavaPairRDD<Integer, Iterable<double[]>> fetchClusteredPoints(JavaRDD<? extends Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    DistanceFn<double[]> distanceFn = getDistanceFn();
    Map<Integer,ClusterInfo> clusters = getClustersByID();
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, vec);
  }).groupByKey();
}
 
Example #4
Source File: KMeansHullGenerator.java    From geowave with Apache License 2.0
public static JavaPairRDD<Integer, Geometry> generateHullsRDD(
    final JavaPairRDD<Integer, Iterable<Vector>> groupedPoints) {
  // Create the convex hull for each kmeans centroid
  final JavaPairRDD<Integer, Geometry> hullRDD = groupedPoints.mapValues(point -> {
    final Iterable<Coordinate> coordIt =
        Iterables.transform(point, new com.google.common.base.Function<Vector, Coordinate>() {
          @Override
          public Coordinate apply(final Vector input) {
            if (input != null) {
              return new Coordinate(input.apply(0), input.apply(1));
            }

            return new Coordinate();
          }
        });

    final Coordinate[] coordArray = Iterables.toArray(coordIt, Coordinate.class);

    return new ConvexHull(coordArray, GeometryUtils.GEOMETRY_FACTORY).getConvexHull();
  });

  return hullRDD;
}
 
Example #5
Source File: JavaPCAExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("PCA Example");
  SparkContext sc = new SparkContext(conf);

  // $example on$
  double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
  LinkedList<Vector> rowsList = new LinkedList<>();
  for (int i = 0; i < array.length; i++) {
    Vector currentRow = Vectors.dense(array[i]);
    rowsList.add(currentRow);
  }
  JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList);

  // Create a RowMatrix from JavaRDD<Vector>.
  RowMatrix mat = new RowMatrix(rows.rdd());

  // Compute the top 3 principal components.
  Matrix pc = mat.computePrincipalComponents(3);
  RowMatrix projected = mat.multiply(pc);
  // $example off$
  Vector[] collectPartitions = (Vector[])projected.rows().collect();
  System.out.println("Projected vector of principal component:");
  for (Vector vector : collectPartitions) {
    System.out.println("\t" + vector);
  }
}
 
Example #6
Source File: JavaSummaryStatisticsExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaSummaryStatisticsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaRDD<Vector> mat = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    ); // an RDD of Vectors

    // Compute column summary statistics.
    MultivariateStatisticalSummary summary = Statistics.colStats(mat.rdd());
    System.out.println(summary.mean());  // a dense vector containing the mean value for each column
    System.out.println(summary.variance());  // column-wise variance
    System.out.println(summary.numNonzeros());  // number of nonzeros in each column
    // $example off$

    jsc.stop();
  }
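The same RDD of Vectors can be fed to other routines in org.apache.spark.mllib.stat.Statistics. As a hedged follow-up, the two lines below would sit before jsc.stop() in the example above, and assume one extra import of org.apache.spark.mllib.linalg.Matrix:

    // Column-wise Pearson correlation matrix of the same data.
    Matrix correlation = Statistics.corr(mat.rdd());
    System.out.println(correlation);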
 
Example #7
Source File: SparkConverter.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix.  Never {@code null}
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double [][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double [] i : dataArray) {
        final Vector currentRow = Vectors.dense(i);
        rowsList.add(currentRow);
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace/100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
 
Example #8
Source File: SparkConverter.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Create an Apache Commons RealMatrix from a Spark RowMatrix.
 *
 * @param r Never {@code null}
 * @param cachedNumRows Checking the number of rows in {@code r} can be time-consuming.  Provide the value here, if it is already known.
 *                      Use {@code -1} if unknown.
 * @return Never {@code null}
 */
public static RealMatrix convertSparkRowMatrixToRealMatrix(final RowMatrix r, final int cachedNumRows) {

    Utils.nonNull(r, "Input row matrix cannot be null");

    int numRows;
    if (cachedNumRows == -1) {
        // This takes a while in Spark
        numRows = (int) r.numRows();
    } else {
        numRows = cachedNumRows;
    }

    final int numCols = (int) r.numCols();

    // This cast is required, even though it would not seem necessary at first; the exact reason is unknown.
    // Compilation fails if the cast is removed.
    final Vector [] rowVectors = (Vector []) r.rows().collect();

    final RealMatrix result = new Array2DRowRealMatrix(numRows, numCols);
    for (int i = 0; i < numRows; i++) {
        result.setRow(i, rowVectors[i].toArray() );
    }
    return result;
}
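The two SparkConverter helpers are natural inverses of each other. A hedged round-trip sketch, assuming a live JavaSparkContext named ctx and that SparkConverter is importable from the project's utilities:

// Commons-Math matrix -> distributed Spark RowMatrix -> Commons-Math matrix.
RealMatrix original = new Array2DRowRealMatrix(new double[][]{{1, 2}, {3, 4}, {5, 6}});
RowMatrix distributed = SparkConverter.convertRealMatrixToSparkRowMatrix(ctx, original, 2);
RealMatrix roundTripped = SparkConverter.convertSparkRowMatrixToRealMatrix(distributed, 3);
// roundTripped should match original, up to floating-point representation.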
 
Example #9
Source File: Model.java    From DDF with Apache License 2.0
@Override
public Double predict(double[] point) throws DDFException {
  MLClassMethods.PredictMethod predictMethod= new MLClassMethods.PredictMethod(this.getRawModel(), MLClassMethods.DEFAULT_PREDICT_METHOD_NAME,
    new Class<?>[]{Vector.class});
  if(predictMethod.getMethod() == null) {
    throw new DDFException(String.format("Cannot locate method specified by %s", MLClassMethods.DEFAULT_PREDICT_METHOD_NAME));

  }
  Object prediction = predictMethod.instanceInvoke(Vectors.dense(point));
  if(prediction instanceof Double) {
    return (Double) prediction;
  } else if (prediction instanceof Integer) {
    return ((Integer) prediction).doubleValue();
  } else {
    throw new DDFException(String.format("Error getting prediction from model %s", this.getRawModel().getClass().getName()));
  }
}
 
Example #10
Source File: SparkConverter.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix.  Never {@code null}
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double [][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double [] i : dataArray) {
        final Vector currentRow = Vectors.dense(i);
        rowsList.add(currentRow);
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace/100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
 
Example #11
Source File: SparkConverter.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Create an Apache Commons RealMatrix from a Spark RowMatrix.
 *
 * @param r Never {@code null}
 * @param cachedNumRows Checking the number of rows in {@code r} can be time-consuming.  Provide the value here, if it is already known.
 *                      Use {@code -1} if unknown.
 * @return Never {@code null}
 */
public static RealMatrix convertSparkRowMatrixToRealMatrix(final RowMatrix r, final int cachedNumRows) {

    Utils.nonNull(r, "Input row matrix cannot be null");

    int numRows;
    if (cachedNumRows == -1) {
        // This takes a while in Spark
        numRows = (int) r.numRows();
    } else {
        numRows = cachedNumRows;
    }

    final int numCols = (int) r.numCols();

    // This cast is required, even though it would not seem necessary at first; the exact reason is unknown.
    // Compilation fails if the cast is removed.
    final Vector [] rowVectors = (Vector []) r.rows().collect();

    final RealMatrix result = new Array2DRowRealMatrix(numRows, numCols);
    for (int i = 0; i < numRows; i++) {
        result.setRow(i, rowVectors[i].toArray() );
    }
    return result;
}
 
Example #12
Source File: DunnIndex.java    From oryx with Apache License 2.0
/**
 * @param evalData data for evaluation
 * @return the Dunn Index of a given clustering
 *  (https://en.wikipedia.org/wiki/Cluster_analysis#Internal_evaluation); higher is better
 */
@Override
double evaluate(JavaRDD<Vector> evalData) {
  // Intra-cluster distance is mean distance to centroid
  double maxIntraClusterDistance =
      fetchClusterMetrics(evalData).values().mapToDouble(ClusterMetric::getMeanDist).max();

  // Inter-cluster distance is distance between centroids
  double minInterClusterDistance = Double.POSITIVE_INFINITY;
  List<ClusterInfo> clusters = new ArrayList<>(getClustersByID().values());
  DistanceFn<double[]> distanceFn = getDistanceFn();
  for (int i = 0; i < clusters.size(); i++) {
    double[] centerI = clusters.get(i).getCenter();
    // Distances are symmetric, hence d(i,j) == d(j,i)
    for (int j = i + 1; j < clusters.size(); j++) {
      double[] centerJ = clusters.get(j).getCenter();
      minInterClusterDistance = Math.min(minInterClusterDistance, distanceFn.applyAsDouble(centerI, centerJ));
    }
  }

  return minInterClusterDistance / maxIntraClusterDistance;
}
 
Example #13
Source File: FeatureValueInstanceUtils.java    From ambiverse-nlu with Apache License 2.0
public static Vector convertToSparkMLVector(FeatureValueInstance fvi, int vectorSize) {
  Map<Integer, Double> featureValues = fvi.getFeatureValues();
  List<Tuple2<Integer, Double>> sortedFeatureValues =
      featureValues.entrySet().stream()
          .sorted((o1, o2) -> Integer.compare(o1.getKey(), o2.getKey()))
          .map(o -> new Tuple2<>(o.getKey(), o.getValue()))
          .collect(Collectors.toList());

  int[] features = new int[sortedFeatureValues.size()];
  double[] values = new double[sortedFeatureValues.size()];

  int i = 0;
  for (Tuple2<Integer, Double> fv : sortedFeatureValues) {
    features[i] = fv._1();
    values[i] = fv._2();
    ++i;
  }

  Vector v = Vectors.sparse(vectorSize, features, values);
  return v;
}
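Example #13 produces sparse vectors, which behave interchangeably with dense ones once the values are read back. Below is a standalone sanity-check sketch (no Spark context needed); note that Vectors.sparse expects its indices in increasing order, which is why the example sorts by feature id first.

import java.util.Arrays;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

public class SparseVsDense {
  public static void main(String[] args) {
    Vector sparse = Vectors.sparse(5, new int[]{1, 3}, new double[]{2.0, 4.0});
    Vector dense = Vectors.dense(0.0, 2.0, 0.0, 4.0, 0.0);

    // Same logical contents regardless of representation.
    System.out.println(Arrays.equals(sparse.toArray(), dense.toArray())); // true
  }
}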
 
Example #14
Source File: JavaSVDExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("SVD Example");
  SparkContext sc = new SparkContext(conf);
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);

  // $example on$
  double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
  LinkedList<Vector> rowsList = new LinkedList<>();
  for (int i = 0; i < array.length; i++) {
    Vector currentRow = Vectors.dense(array[i]);
    rowsList.add(currentRow);
  }
  JavaRDD<Vector> rows = jsc.parallelize(rowsList);

  // Create a RowMatrix from JavaRDD<Vector>.
  RowMatrix mat = new RowMatrix(rows.rdd());

  // Compute the top 3 singular values and corresponding singular vectors.
  SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(3, true, 1.0E-9d);
  RowMatrix U = svd.U();
  Vector s = svd.s();
  Matrix V = svd.V();
  // $example off$
  Vector[] collectPartitions = (Vector[]) U.rows().collect();
  System.out.println("U factor is:");
  for (Vector vector : collectPartitions) {
    System.out.println("\t" + vector);
  }
  System.out.println("Singular values are: " + s);
  System.out.println("V factor is:\n" + V);

  jsc.stop();
}
 
Example #15
Source File: MLLibUtil.java    From deeplearning4j with Apache License 2.0
/**
 * Convert an ndarray to a vector
 * @param arr the array
 * @return an mllib vector
 */
public static Vector toVector(INDArray arr) {
    if (!arr.isVector()) {
        throw new IllegalArgumentException("passed in array must be a vector");
    }
    if (arr.length() > Integer.MAX_VALUE)
        throw new ND4JArraySizeException();
    double[] ret = new double[(int) arr.length()];
    for (int i = 0; i < arr.length(); i++) {
        ret[i] = arr.getDouble(i);
    }

    return Vectors.dense(ret);
}
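A short usage sketch for Example #15, assuming the ND4J factory call Nd4j.create(double[]) for building a row vector, plus imports of org.nd4j.linalg.api.ndarray.INDArray and org.nd4j.linalg.factory.Nd4j:

// Convert a 3-element ND4J row vector into a dense MLlib vector.
INDArray row = Nd4j.create(new double[] {1.5, 2.5, 3.5});
Vector mllibVector = MLLibUtil.toVector(row);   // [1.5, 2.5, 3.5]
// Passing a non-vector (e.g. a 2x2 matrix) would throw IllegalArgumentException.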
 
Example #16
Source File: KMeansHullGenerator.java    From geowave with Apache License 2.0
public static JavaPairRDD<Integer, Iterable<Vector>> groupByIndex(
    final JavaRDD<Vector> inputPoints,
    final KMeansModel clusterModel) {
  // Group the input points by their kmeans centroid index
  return inputPoints.groupBy(point -> {
    return clusterModel.predict(point);
  });
}
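Examples #4 and #16 are two halves of the same pipeline: group points by their k-means cluster, then build a convex hull per cluster. A hedged sketch of how they might be chained, assuming points is a JavaRDD<Vector>, model is a trained org.apache.spark.mllib.clustering.KMeansModel, and Geometry is the JTS geometry type used by GeoWave:

// Group by centroid index, then compute one convex hull per cluster.
JavaPairRDD<Integer, Iterable<Vector>> grouped =
    KMeansHullGenerator.groupByIndex(points, model);
JavaPairRDD<Integer, Geometry> hulls =
    KMeansHullGenerator.generateHullsRDD(grouped);
System.out.println("Hull count per cluster index: " + hulls.countByKey());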
 
Example #17
Source File: LogisticRegressionBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = trainingData.collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example #18
Source File: JavaBisectingKMeansExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaBisectingKMeansExample");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);

  // $example on$
  ArrayList<Vector> localData = Lists.newArrayList(
    Vectors.dense(0.1, 0.1),   Vectors.dense(0.3, 0.3),
    Vectors.dense(10.1, 10.1), Vectors.dense(10.3, 10.3),
    Vectors.dense(20.1, 20.1), Vectors.dense(20.3, 20.3),
    Vectors.dense(30.1, 30.1), Vectors.dense(30.3, 30.3)
  );
  JavaRDD<Vector> data = sc.parallelize(localData, 2);

  BisectingKMeans bkm = new BisectingKMeans()
    .setK(4);
  BisectingKMeansModel model = bkm.run(data);

  System.out.println("Compute Cost: " + model.computeCost(data));

  Vector[] clusterCenters = model.clusterCenters();
  for (int i = 0; i < clusterCenters.length; i++) {
    Vector clusterCenter = clusterCenters[i];
    System.out.println("Cluster Center " + i + ": " + clusterCenter);
  }
  // $example off$

  sc.stop();
}
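As a follow-up to Example #18, the trained BisectingKMeansModel can also assign previously unseen points. The short hedged addition below would sit before sc.stop() in the example above:

  // Assign a new point to the nearest of the 4 learned clusters.
  Vector newPoint = Vectors.dense(10.2, 10.2);
  System.out.println("Cluster for " + newPoint + ": " + model.predict(newPoint));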
 
Example #19
Source File: DecisionTreeRegressionModelBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testDecisionTreeRegression() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/regression_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a DecisionTree model.
    DecisionTreeRegressionModel regressionModel = new DecisionTreeRegressor()
            .setFeaturesCol("features").fit(trainingData);

    byte[] exportedModel = ModelExporter.export(regressionModel, null);

    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = regressionModel.transform(testData).select("features", "prediction").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(0);
        double actual = row.getDouble(1);

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), v.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next());

        System.out.println(actual + ", " + predicted);
        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example #20
Source File: JavaGaussianMixtureExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaGaussianMixtureExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse data
    String path = "data/mllib/gmm_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.trim().split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    parsedData.cache();

    // Cluster the data into two classes using GaussianMixture
    GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd());

    // Save and load GaussianMixtureModel
    gmm.save(jsc.sc(), "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel");
    GaussianMixtureModel sameModel = GaussianMixtureModel.load(jsc.sc(),
      "target/org.apache.spark.JavaGaussianMixtureExample/GaussianMixtureModel");

    // Output the parameters of the mixture model
    for (int j = 0; j < gmm.k(); j++) {
      System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
        gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma());
    }
    // $example off$

    jsc.stop();
  }
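Example #20 only prints the mixture parameters; the trained model can also label points. A hedged addition that would go before jsc.stop(), assuming GaussianMixtureModel.predict(JavaRDD<Vector>) for hard assignments:

    // Hard cluster assignments for the training points.
    JavaRDD<Integer> assignments = gmm.predict(parsedData);
    System.out.println(assignments.take(5));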
 
Example #21
Source File: IfZeroVectorBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testModelExportAndImportDense() {
    IfZeroVector sparkModel = new IfZeroVector()
            .setInputCol("vectorized_count")
            .setOutputCol("product_title_filtered")
            .setThenSetValue("others")
            .setElseSetCol("product_title");
    byte[] exportedModel = ModelExporter.export(sparkModel, denseOrderDF);
    final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    //compare predictions
    Row[] denseOrder = denseOrderDF.collect();

    for (int i = 0; i < denseOrder.length; i++) {
        double[] input = ((Vector) denseOrder[i].get(0)).toArray();
        System.out.println("Input double array from dense = " + Arrays.toString(input));
        String colValue = ((String) denseOrder[i].get(1));

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("vectorized_count", input);
        data.put("product_title", colValue);

        transformer.transform(data);
        double[] tInput = (double[]) data.get("vectorized_count");
        String tColValue = (String) data.get("product_title");
        String output = (String) data.get("product_title_filtered");
        assertEquals(input, tInput);
        assertEquals(colValue, tColValue);
        String expectedOutput = (i == 0) ? "others" : colValue;
        System.out.println(output);
        assertEquals(expectedOutput, output);
    }
}
 
Example #22
Source File: IfZeroVectorBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testModelExportAndImportSparse() {
    IfZeroVector sparkModel = new IfZeroVector()
            .setInputCol("vectorized_count")
            .setOutputCol("product_title_filtered")
            .setThenSetValue("others")
            .setElseSetCol("product_title");
    byte[] exportedModel = ModelExporter.export(sparkModel, sparseOrderDF);
    final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    //compare predictions
    Row[] sparseOrder = sparseOrderDF.collect();

    for (int i = 0; i < sparseOrder.length; i++) {
        double[] input = ((Vector) sparseOrder[i].get(0)).toArray();
        System.out.println("Input double array from sparse = " + Arrays.toString(input));
        String colValue = ((String) sparseOrder[i].get(1));

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("vectorized_count", input);
        data.put("product_title", colValue);

        transformer.transform(data);
        double[] tInput = (double[]) data.get("vectorized_count");
        String tColValue = (String) data.get("product_title");
        String output = (String) data.get("product_title_filtered");
        assertEquals(input, tInput);
        assertEquals(colValue, tColValue);
        String expectedOutput = (i == 0) ? "others" : colValue;
        System.out.println(output);
        assertEquals(expectedOutput, output);
    }
}
 
Example #23
Source File: LogisticRegression1BridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";

    DataFrame trainingData = sqlContext.read().format("libsvm").load(datapath);

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, trainingData);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example #24
Source File: SilhouetteCoefficient.java    From oryx with Apache License 2.0
static JavaRDD<Vector> fetchSampleData(JavaRDD<Vector> evalData) {
  long count = evalData.count();
  if (count > MAX_SAMPLE_SIZE) {
    return evalData.sample(false, (double) MAX_SAMPLE_SIZE / count);
  }
  return evalData;
}
 
Example #25
Source File: KMeansUpdate.java    From oryx with Apache License 2.0
private JavaRDD<Vector> parsedToVectorRDD(JavaRDD<String[]> parsedRDD) {
  return parsedRDD.map(data -> {
    try {
      return Vectors.dense(KMeansUtils.featuresFromTokens(data, inputSchema));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(data));
      throw e;
    }
  });
}
 
Example #26
Source File: StandardScalerBridgeTest.java    From spark-transformers with Apache License 2.0
private void assertCorrectness(Row[] sparkOutput, double[][] expected, Transformer transformer) {
    for (int i = 0; i < 2; i++) {
        double[] input = ((Vector) sparkOutput[i].get(0)).toArray();

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", input);
        transformer.transform(data);
        double[] transformedOp = (double[]) data.get("scaledOutput");

        double[] sparkOp = ((Vector) sparkOutput[i].get(1)).toArray();
        assertArrayEquals(transformedOp, sparkOp, EPSILON);
        assertArrayEquals(transformedOp, expected[i], EPSILON);
    }
}
 
Example #27
Source File: KMeansUpdate.java    From oryx with Apache License 2.0
private ClusteringModel pmmlClusteringModel(KMeansModel model,
                                            Map<Integer,Long> clusterSizesMap) {
  Vector[] clusterCenters = model.clusterCenters();

  List<ClusteringField> clusteringFields = new ArrayList<>();
  for (int i = 0; i < inputSchema.getNumFeatures(); i++) {
    if (inputSchema.isActive(i)) {
      FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(i));
      ClusteringField clusteringField =
          new ClusteringField(fieldName).setCenterField(ClusteringField.CenterField.TRUE);
      clusteringFields.add(clusteringField);
    }
  }

  List<Cluster> clusters = new ArrayList<>(clusterCenters.length);
  for (int i = 0; i < clusterCenters.length; i++) {
    clusters.add(new Cluster().setId(Integer.toString(i))
                     .setSize(clusterSizesMap.get(i).intValue())
                     .setArray(AppPMMLUtils.toArray(clusterCenters[i].toArray())));
  }

  return new ClusteringModel(
      MiningFunction.CLUSTERING,
      ClusteringModel.ModelClass.CENTER_BASED,
      clusters.size(),
      AppPMMLUtils.buildMiningSchema(inputSchema),
      new ComparisonMeasure(ComparisonMeasure.Kind.DISTANCE, new SquaredEuclidean()),
      clusteringFields,
      clusters);
}
 
Example #28
Source File: MLSupporter.java    From DDF with Apache License 2.0
/**
 * Override this to return the appropriate DDF representation matching that specified in {@link ParamInfo}. The base
 * implementation simply returns the DDF.
 *
 * @param paramInfo
 * @return
 */
@SuppressWarnings("unchecked")
@Override
protected Object convertDDF(ParamInfo paramInfo) throws DDFException {
  mLog.info(">>>> Running ConvertDDF of io.ddf.spark.ml.MLSupporter");
  if (paramInfo.argMatches(RDD.class)) {
    // Yay, our target data format is an RDD!
    RDD<?> rdd = null;

    if (paramInfo.paramMatches(LabeledPoint.class)) {
      rdd = (RDD<LabeledPoint>) this.getDDF().getRepresentationHandler().get(RDD.class, LabeledPoint.class);

    } else if (paramInfo.paramMatches(Vector.class)) {
      rdd = (RDD<Vector>) this.getDDF().getRepresentationHandler().get(RDD.class, Vector.class);
    } else if (paramInfo.paramMatches(double[].class)) {
      rdd = (RDD<double[]>) this.getDDF().getRepresentationHandler().get(RDD.class, double[].class);
    } else if (paramInfo.paramMatches(io.ddf.types.Vector.class)) {
      rdd = (RDD<io.ddf.types.Vector>) this.getDDF().getRepresentationHandler()
          .get(RDD.class, io.ddf.types.Vector.class);
    } else if (paramInfo.paramMatches(TupleMatrixVector.class)) {
      rdd = (RDD<TupleMatrixVector>) this.getDDF().getRepresentationHandler().get(RDD.class, TupleMatrixVector.class);
    } else if (paramInfo.paramMatches(Rating.class)) {
      rdd = (RDD<Rating>) this.getDDF().getRepresentationHandler().get(RDD.class, Rating.class);
    }
    //      else if (paramInfo.paramMatches(TablePartition.class)) {
    //        rdd = (RDD<TablePartition>) this.getDDF().getRepresentationHandler().get(RDD.class, TablePartition.class);
    //      }
    else if (paramInfo.paramMatches(Object.class)) {
      rdd = (RDD<Object[]>) this.getDDF().getRepresentationHandler().get(RDD.class, Object[].class);
    }

    return rdd;
  } else {
    return super.convertDDF(paramInfo);
  }
}
 
Example #29
Source File: DaviesBouldinIndex.java    From oryx with Apache License 2.0
/**
 * @param evalData data for evaluation
 * @return the Davies-Bouldin Index (http://en.wikipedia.org/wiki/Cluster_analysis#Internal_evaluation);
 *  lower is better
 */
@Override
double evaluate(JavaRDD<Vector> evalData) {
  Map<Integer,ClusterMetric> clusterMetricsByID = fetchClusterMetrics(evalData).collectAsMap();
  Map<Integer,ClusterInfo> clustersByID = getClustersByID();
  DistanceFn<double[]> distanceFn = getDistanceFn();

  return clustersByID.entrySet().stream().mapToDouble(entryI -> {
    Integer idI = entryI.getKey();
    double[] centerI = entryI.getValue().getCenter();
    double clusterScatter1 = clusterMetricsByID.get(idI).getMeanDist();
    // this inner loop should not be set to j = (i+1) as DB Index computation is not symmetric.
    // For a given cluster i, we look for a cluster j that maximizes
    // the ratio of (the sum of average distances from points in cluster i to its center and
    // points in cluster j to its center) to (the distance between cluster i and cluster j).
    // The key point is that the maximization is done per cluster:
    // the cluster j that maximizes this ratio for a given i is not necessarily the one for which i maximizes it.
    return clustersByID.entrySet().stream().mapToDouble(entryJ -> {
      Integer idJ = entryJ.getKey();
      if (idI.equals(idJ)) {
        return 0.0;
      }
      double[] centerJ = entryJ.getValue().getCenter();
      double clusterScatter2 = clusterMetricsByID.get(idJ).getMeanDist();
      return (clusterScatter1 + clusterScatter2) / distanceFn.applyAsDouble(centerI, centerJ);
    }).max().orElse(0.0);
  }).average().orElse(0.0);
}
 
Example #30
Source File: KMeansUpdate.java    From oryx with Apache License 2.0
/**
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int numClusters = (Integer) hyperParameters.get(0);
  Preconditions.checkArgument(numClusters > 1);
  log.info("Building KMeans Model with {} clusters", numClusters);

  JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
  KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations, initializationStrategy);

  return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}