Java Code Examples for org.apache.spark.sql.types.Metadata#empty()

The following examples show how to use org.apache.spark.sql.types.Metadata#empty(). Each example comes from an open source project; the source file, originating project, and license are noted above each snippet.
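As a quick orientation before the examples, here is a minimal sketch (not taken from any of the projects below; the class name is purely illustrative) of the usual pattern: Metadata.empty() is passed as the last argument of a StructField when a column carries no metadata, while a MetadataBuilder is used when it does.

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.MetadataBuilder;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class MetadataEmptySketch {
  public static void main(String[] args) {
    // No column metadata: pass Metadata.empty() as the fourth StructField argument.
    StructField plain = new StructField("name", DataTypes.StringType, false, Metadata.empty());

    // With column metadata: build a Metadata instance with MetadataBuilder instead.
    Metadata withComment = new MetadataBuilder().putString("comment", "customer age in years").build();
    StructField annotated = new StructField("age", DataTypes.IntegerType, true, withComment);

    StructType schema = new StructType(new StructField[]{plain, annotated});
    System.out.println(schema.simpleString());
  }
}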
Example 1
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testIgnoreNulls() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0,105));
  configMap.put(RangeRowRule.IGNORE_NULLS_CONFIG, true);
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", null, new BigDecimal("0.00"));
  assertTrue("Row should pass rule", rule.check(row1));
}
 
Example 2
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testRangeDataTypes() throws Exception {
  Config config = ConfigUtils.configFromResource("/dq/dq-range-rules.conf").getConfig("steps");
  StructType schema = new StructType(new StructField[] {
    new StructField("fa", DataTypes.LongType, false, Metadata.empty()),
    new StructField("fi", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("fl", DataTypes.LongType, false, Metadata.empty()),
    new StructField("ff", DataTypes.FloatType, false, Metadata.empty()),
    new StructField("fe", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("fd", DataTypes.createDecimalType(), false, Metadata.empty())
  });
  Row row = new RowWithSchema(schema, 2L, 2, 2L, 2.0f, 2.0, new BigDecimal("2.0"));

  ConfigObject rro = config.getObject("dq1.deriver.rules");
  for (String rulename : rro.keySet()) {
    Config rrc = rro.toConfig().getConfig(rulename);
    RangeRowRule rrr = new RangeRowRule();
    rrr.configure(rrc);
    rrr.configureName(rulename);
    assertTrue("Row should pass rule " + rulename, rrr.check(row));
  }
}
 
Example 3
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tags a distributed list of sentences and writes the result to an output file with 
 * a desired output format.
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(JavaRDD<Row> sentences, String outputFileName, OutputFormat outputFormat) {
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(sentences, schema);
	tag(input, outputFileName, outputFormat);
}
 
Example 4
Source File: JavaAFTSurvivalRegressionExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaAFTSurvivalRegressionExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(1.218, 1.0, Vectors.dense(1.560, -0.605)),
    RowFactory.create(2.949, 0.0, Vectors.dense(0.346, 2.158)),
    RowFactory.create(3.627, 0.0, Vectors.dense(1.380, 0.231)),
    RowFactory.create(0.273, 1.0, Vectors.dense(0.520, 1.151)),
    RowFactory.create(4.199, 0.0, Vectors.dense(0.795, -0.226))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("censor", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> training = spark.createDataFrame(data, schema);
  double[] quantileProbabilities = new double[]{0.3, 0.6};
  AFTSurvivalRegression aft = new AFTSurvivalRegression()
    .setQuantileProbabilities(quantileProbabilities)
    .setQuantilesCol("quantiles");

  AFTSurvivalRegressionModel model = aft.fit(training);

  // Print the coefficients, intercept and scale parameter for AFT survival regression
  System.out.println("Coefficients: " + model.coefficients());
  System.out.println("Intercept: " + model.intercept());
  System.out.println("Scale: " + model.scale());
  model.transform(training).show(false);
  // $example off$

  spark.stop();
}
 
Example 5
Source File: Tagger.java    From vn.vitk with GNU General Public License v3.0
/**
 * Tags a list of sequences and writes the result to an output file with a
 * desired output format.
 * 
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void tag(List<String> sentences, String outputFileName, OutputFormat outputFormat) {
	List<Row> rows = new LinkedList<Row>();
	for (String sentence : sentences) {
		rows.add(RowFactory.create(sentence));
	}
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(rows, schema);
	tag(input, outputFileName, outputFormat);
}
 
Example 6
Source File: JavaTfIdfExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaTfIdfExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0.0, "Hi I heard about Spark"),
    RowFactory.create(0.0, "I wish Java could use case classes"),
    RowFactory.create(1.0, "Logistic regression models are neat")
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
  });
  Dataset<Row> sentenceData = spark.createDataFrame(data, schema);

  Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
  Dataset<Row> wordsData = tokenizer.transform(sentenceData);

  int numFeatures = 20;
  HashingTF hashingTF = new HashingTF()
    .setInputCol("words")
    .setOutputCol("rawFeatures")
    .setNumFeatures(numFeatures);

  Dataset<Row> featurizedData = hashingTF.transform(wordsData);
  // alternatively, CountVectorizer can also be used to get term frequency vectors (see the sketch after this example)

  IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
  IDFModel idfModel = idf.fit(featurizedData);

  Dataset<Row> rescaledData = idfModel.transform(featurizedData);
  rescaledData.select("label", "features").show();
  // $example off$

  spark.stop();
}
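The in-code comment in Example 6 notes that CountVectorizer can be used instead of HashingTF for the term-frequency step. A minimal hedged sketch of that variant, assuming the same wordsData Dataset and column names as above:

import org.apache.spark.ml.feature.CountVectorizer;
import org.apache.spark.ml.feature.CountVectorizerModel;

// Learn a vocabulary from the tokenized words and produce term-frequency vectors
// instead of hashing them with HashingTF.
CountVectorizerModel cvModel = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("rawFeatures")
  .setVocabSize(20)  // cap on vocabulary size, playing the role of numFeatures above
  .fit(wordsData);

Dataset<Row> featurizedData = cvModel.transform(wordsData);
// The IDF stage from the example can then be applied to featurizedData unchanged.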
 
Example 7
Source File: JavaNormalizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaNormalizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
      RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
      RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
      RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  // Normalize each Vector using $L^1$ norm.
  Normalizer normalizer = new Normalizer()
    .setInputCol("features")
    .setOutputCol("normFeatures")
    .setP(1.0);

  Dataset<Row> l1NormData = normalizer.transform(dataFrame);
  l1NormData.show();

  // Normalize each Vector using $L^\infty$ norm.
  Dataset<Row> lInfNormData =
    normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
  lInfNormData.show();
  // $example off$

  spark.stop();
}
 
Example 8
Source File: SimplePredictionFromTextFile.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "Simple prediction from Text File").master("local").getOrCreate();

  spark.udf().register("vectorBuilder", new VectorBuilder(), new VectorUDT());

  String filename = "data/tuple-data-file.csv";
  StructType schema = new StructType(new StructField[] {
      new StructField("_c0", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("_c1", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), true, Metadata.empty()) });

  Dataset<Row> df = spark.read().format("csv").schema(schema).option("header",
      "false")
      .load(filename);
  df = df.withColumn("valuefeatures", df.col("_c0")).drop("_c0");
  df = df.withColumn("label", df.col("_c1")).drop("_c1");
  df.printSchema();

  df = df.withColumn("features", callUDF("vectorBuilder", df.col(
      "valuefeatures")));
  df.printSchema();
  df.show();

  LinearRegression lr = new LinearRegression().setMaxIter(20);// .setRegParam(1).setElasticNetParam(1);

  // Fit the model to the data.
  LinearRegressionModel model = lr.fit(df);

  // Given a dataset, predict each point's label, and show the results.
  model.transform(df).show();

  LinearRegressionTrainingSummary trainingSummary = model.summary();
  System.out.println("numIterations: " + trainingSummary.totalIterations());
  System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary
      .objectiveHistory()));
  trainingSummary.residuals().show();
  System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
  System.out.println("r2: " + trainingSummary.r2());

  double intercept = model.intercept();
  System.out.println("Interesection: " + intercept);
  double regParam = model.getRegParam();
  System.out.println("Regression parameter: " + regParam);
  double tol = model.getTol();
  System.out.println("Tol: " + tol);
  Double feature = 7.0;
  Vector features = Vectors.dense(feature);
  double p = model.predict(features);

  System.out.println("Prediction for feature " + feature + " is " + p);
  System.out.println(8 * regParam + intercept);
}
 
Example 9
Source File: FirstPrediction.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName("First Prediction")
      .master("local").getOrCreate();

  StructType schema = new StructType(new StructField[] {
      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty()) });

  // TODO this example is not working yet
}
 
Example 10
Source File: EntitySalienceFeatureExtractorSpark.java    From ambiverse-nlu with Apache License 2.0
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc
 * @param documents
 * @param sqlContext
 * @return
 * @throws ResourceInitializationException
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents.flatMap(s -> {
                TOTAL_DOCS.add(1);
                return fe.getTrainingInstances(s.getJCas(),
                        trainingSettings.getFeatureExtractor(),
                        trainingSettings.getPositiveInstanceScalingFactor());
            });

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entityId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example 11
Source File: VectorBinarizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testVectorBinarizerDense() {
    // prepare data

    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(0d, 1d, new DenseVector(new double[]{-2d, -3d, -4d, -1d, 6d, -7d, 8d, 0d, 0d, 0d, 0d, 0d})),
            RowFactory.create(1d, 2d, new DenseVector(new double[]{4d, -5d, 6d, 7d, -8d, 9d, -10d, 0d, 0d, 0d, 0d, 0d})),
            RowFactory.create(2d, 3d, new DenseVector(new double[]{-5d, 6d, -8d, 9d, 10d, 11d, 12d, 0d, 0d, 0d, 0d, 0d}))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
    VectorBinarizer vectorBinarizer = new VectorBinarizer()
            .setInputCol("vector1")
            .setOutputCol("binarized")
            .setThreshold(2d);


    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorBinarizer, df);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    //compare predictions
    Row[] sparkOutput = vectorBinarizer.transform(df).orderBy("id").select("id", "value1", "vector1", "binarized").collect();
    for (Row row : sparkOutput) {

        Map<String, Object> data = new HashMap<>();
        data.put(vectorBinarizer.getInputCol(), ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);
        double[] output = (double[]) data.get(vectorBinarizer.getOutputCol());
        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}
 
Example 12
Source File: JavaPolynomialExpansionExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaPolynomialExpansionExample")
    .getOrCreate();

  // $example on$
  PolynomialExpansion polyExpansion = new PolynomialExpansion()
    .setInputCol("features")
    .setOutputCol("polyFeatures")
    .setDegree(3);

  List<Row> data = Arrays.asList(
    RowFactory.create(Vectors.dense(2.0, 1.0)),
    RowFactory.create(Vectors.dense(0.0, 0.0)),
    RowFactory.create(Vectors.dense(3.0, -1.0))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("features", new VectorUDT(), false, Metadata.empty()),
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  Dataset<Row> polyDF = polyExpansion.transform(df);
  polyDF.show(false);
  // $example off$

  spark.stop();
}
 
Example 13
Source File: TestSparkSchema.java    From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaCombinedWithProjection() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
              new StructField("data", DataTypes.StringType, true, Metadata.empty())
          }
      );

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation)
      .select("id");

  Row[] results = (Row[]) resultDf.collect();

  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
 
Example 14
Source File: CMMModel.java    From vn.vitk with GNU General Public License v3.0
@Override
public DataFrame transform(DataFrame dataset) {
	JavaRDD<Row> output = dataset.javaRDD().map(new DecodeFunction());
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
		new StructField("prediction", DataTypes.StringType, false, Metadata.empty())
	});
	return dataset.sqlContext().createDataFrame(output, schema);
}
 
Example 15
Source File: AverageUDAF.java    From Apache-Spark-2x-for-Java-Developers with MIT License
@Override
public StructType inputSchema() {
	return new StructType(new StructField[] { new StructField("counter", DataTypes.DoubleType, true, Metadata.empty())});
}
 
Example 16
Source File: StringSanitizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testStringSanitizer() {

	//prepare data
	JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
			RowFactory.create(1, "Jyoti complex near Sananda clothes store; English Bazar; Malda;WB;India,"),
			RowFactory.create(2, "hallalli vinayaka tent road c/o B K vishwanath Mandya"),
			RowFactory.create(3, "M.sathish S/o devudu Lakshmi opticals Gokavaram bus stand Rajhamundry 9494954476")
	));

	StructType schema = new StructType(new StructField[]{
			new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
			new StructField("rawText", DataTypes.StringType, false, Metadata.empty())
	});
	Dataset<Row> dataset = spark.createDataFrame(rdd, schema);
	dataset.show();

	//train model in spark
	StringSanitizer sparkModel = new StringSanitizer()
			.setInputCol("rawText")
			.setOutputCol("token");

	//Export this model
	byte[] exportedModel = ModelExporter.export(sparkModel);

	//Import and get Transformer
	Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

	List<Row> pairs = sparkModel.transform(dataset).select("rawText", "token").collectAsList();

	for (Row row : pairs) {
		Map<String, Object> data = new HashMap<String, Object>();
		data.put(sparkModel.getInputCol(), row.getString(0));
		transformer.transform(data);

		String[] actual = (String[]) data.get(sparkModel.getOutputCol());

		List<String> actualList = Arrays.asList(actual);
		List<String> expected = row.getList(1);

		assertTrue("both should be same", actualList.equals(expected));
	}
}
 
Example 17
Source File: AttributeReference.java    From indexr with Apache License 2.0
public AttributeReference(String name, DataType dataType) {
    this.name = name;
    this.dataType = dataType;
    this.metadata = Metadata.empty();
    this.exprId = NamedExpression.newExprId();
}
 
Example 18
Source File: JavaIndexToStringExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaIndexToStringExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, "a"),
    RowFactory.create(1, "b"),
    RowFactory.create(2, "c"),
    RowFactory.create(3, "a"),
    RowFactory.create(4, "a"),
    RowFactory.create(5, "c")
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("category", DataTypes.StringType, false, Metadata.empty())
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  StringIndexerModel indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")
    .fit(df);
  Dataset<Row> indexed = indexer.transform(df);

  System.out.println("Transformed string column '" + indexer.getInputCol() + "' " +
      "to indexed column '" + indexer.getOutputCol() + "'");
  indexed.show();

  StructField inputColSchema = indexed.schema().apply(indexer.getOutputCol());
  System.out.println("StringIndexer will store labels in output column metadata: " +
      Attribute.fromStructField(inputColSchema).toString() + "\n");

  IndexToString converter = new IndexToString()
    .setInputCol("categoryIndex")
    .setOutputCol("originalCategory");
  Dataset<Row> converted = converter.transform(indexed);

  System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to " +
      "original string column '" + converter.getOutputCol() + "' using labels in metadata");
  converted.select("id", "categoryIndex", "originalCategory").show();

  // $example off$
  spark.stop();
}
 
Example 19
Source File: VectorAssemblerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testVectorAssembler() {
    // prepare data

    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(0d, 1d, new DenseVector(new double[]{2d, 3d})),
            RowFactory.create(1d, 2d, new DenseVector(new double[]{3d, 4d})),
            RowFactory.create(2d, 3d, new DenseVector(new double[]{4d, 5d})),
            RowFactory.create(3d, 4d, new DenseVector(new double[]{5d, 6d})),
            RowFactory.create(4d, 5d, new DenseVector(new double[]{6d, 7d}))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(jrdd, schema);
    VectorAssembler vectorAssembler = new VectorAssembler()
            .setInputCols(new String[]{"value1", "vector1"})
            .setOutputCol("feature");


    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorAssembler);

    String exportedModelJson = new String(exportedModel);
    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    //compare predictions
    List<Row> sparkOutput = vectorAssembler.transform(df).orderBy("id").select("id", "value1", "vector1", "feature").collectAsList();
    for (Row row : sparkOutput) {

        Map<String, Object> data = new HashMap<>();
        data.put(vectorAssembler.getInputCols()[0], row.get(1));
        data.put(vectorAssembler.getInputCols()[1], ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);
        double[] output = (double[]) data.get(vectorAssembler.getOutputCol());
        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}
 
Example 20
Source File: JavaBucketedRandomProjectionLSHExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBucketedRandomProjectionLSHExample")
    .getOrCreate();

  // $example on$
  List<Row> dataA = Arrays.asList(
    RowFactory.create(0, Vectors.dense(1.0, 1.0)),
    RowFactory.create(1, Vectors.dense(1.0, -1.0)),
    RowFactory.create(2, Vectors.dense(-1.0, -1.0)),
    RowFactory.create(3, Vectors.dense(-1.0, 1.0))
  );

  List<Row> dataB = Arrays.asList(
      RowFactory.create(4, Vectors.dense(1.0, 0.0)),
      RowFactory.create(5, Vectors.dense(-1.0, 0.0)),
      RowFactory.create(6, Vectors.dense(0.0, 1.0)),
      RowFactory.create(7, Vectors.dense(0.0, -1.0))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("keys", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dfA = spark.createDataFrame(dataA, schema);
  Dataset<Row> dfB = spark.createDataFrame(dataB, schema);

  Vector key = Vectors.dense(1.0, 0.0);

  BucketedRandomProjectionLSH mh = new BucketedRandomProjectionLSH()
    .setBucketLength(2.0)
    .setNumHashTables(3)
    .setInputCol("keys")
    .setOutputCol("values");

  BucketedRandomProjectionLSHModel model = mh.fit(dfA);

  // Feature Transformation
  model.transform(dfA).show();
  // Cache the transformed columns
  Dataset<Row> transformedA = model.transform(dfA).cache();
  Dataset<Row> transformedB = model.transform(dfB).cache();

  // Approximate similarity join
  model.approxSimilarityJoin(dfA, dfB, 1.5).show();
  model.approxSimilarityJoin(transformedA, transformedB, 1.5).show();
  // Self Join
  model.approxSimilarityJoin(dfA, dfA, 2.5).filter("datasetA.id < datasetB.id").show();

  // Approximate nearest neighbor search
  model.approxNearestNeighbors(dfA, key, 2).show();
  model.approxNearestNeighbors(transformedA, key, 2).show();
  // $example off$

  spark.stop();
}