Java Code Examples for org.apache.spark.sql.types.Metadata

The following examples show how to use org.apache.spark.sql.types.Metadata. They are extracted from open source projects; the source project, file, and license are listed above each example.
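Every example below passes Metadata.empty() as the final StructField argument, i.e. no column-level metadata. When a column does need metadata attached, it is typically built with org.apache.spark.sql.types.MetadataBuilder. The following is a minimal sketch of that pattern; the key names and column names are illustrative and not taken from any of the projects below.

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.MetadataBuilder;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class MetadataSketch {
  public static void main(String[] args) {
    // Build a Metadata instance with a few key/value pairs (keys are illustrative).
    Metadata ageMetadata = new MetadataBuilder()
      .putString("comment", "age in whole years")
      .putLong("maxValue", 150)
      .build();

    // Attach it to one column; the other column uses the empty singleton.
    StructType schema = new StructType(new StructField[]{
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, true, ageMetadata)
    });

    // The metadata travels with the schema and can be read back from the field.
    Metadata readBack = schema.apply("age").metadata();
    System.out.println(readBack.getString("comment"));  // age in whole years
    System.out.println(readBack.contains("maxValue"));  // true
  }
}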
Example 1
Source Project: SparkDemo   Source File: JavaStopWordsRemoverExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaStopWordsRemoverExample")
    .getOrCreate();

  // $example on$
  StopWordsRemover remover = new StopWordsRemover()
    .setInputCol("raw")
    .setOutputCol("filtered");

  List<Row> data = Arrays.asList(
    RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
    RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField(
      "raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
  });

  Dataset<Row> dataset = spark.createDataFrame(data, schema);
  remover.transform(dataset).show(false);
  // $example off$
  spark.stop();
}
 
Example 2
Source Project: bunsen   Source File: DefinitionToSparkVisitor.java    License: Apache License 2.0
@Override
public HapiConverter visitContained(String elementPath,
    String elementTypeUrl,
    Map<String, StructureField<HapiConverter<DataType>>> contained) {

  StructField[] fields = contained.values()
      .stream()
      .map(containedEntry -> new StructField(containedEntry.fieldName(),
          containedEntry.result().getDataType(),
          true,
          Metadata.empty()))
      .toArray(StructField[]::new);

  ArrayType container = new ArrayType(new StructType(fields), true);

  return new HapiContainedToSparkConverter(contained, container);
}
 
Example 3
Source Project: DataVec   Source File: DataFrames.java    License: Apache License 2.0
/**
 * Convert a DataVec schema to a Spark struct type.
 *
 * @param schema the DataVec schema to convert
 * @return the corresponding Spark struct type
 */
public static StructType fromSchema(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns()];
    for (int i = 0; i < structFields.length; i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i] = new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i] =
                                new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i] = new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i] = new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                                "This api should not be used with strings , binary data or ndarrays. This is only for columnar data");
        }
    }
    return new StructType(structFields);
}
 
Example 4
Source Project: vn.vitk   Source File: Tagger.java    License: GNU General Public License v3.0
/**
 * Tags a list of sentences and returns a list of tag sequences.
 * @param sentences a list of sentences to tag
 * @return a list of tagged sequences.
 */
public List<String> tag(List<String> sentences) {
	List<Row> rows = new LinkedList<Row>();
	for (String sentence : sentences) {
		rows.add(RowFactory.create(sentence));
	}
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty())	
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame input = sqlContext.createDataFrame(rows, schema);
	if (cmmModel != null) {
		DataFrame output = cmmModel.transform(input).repartition(1);
		return output.javaRDD().map(new RowToStringFunction(1)).collect();
	} else {
		System.err.println("Tagging model is null. You need to create or load a model first.");
		return null;
	}
}
 
Example 5
Source Project: vn.vitk   Source File: NGramBuilder.java    License: GNU General Public License v3.0
/**
 * Creates an n-gram data frame from text lines.
 * @param lines text lines to process
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
	JavaRDD<Row> rows = lines.map(new Function<String, Row>(){
		private static final long serialVersionUID = -4332903997027358601L;
		
		@Override
		public Row call(String line) throws Exception {
			return RowFactory.create(Arrays.asList(line.split("\\s+")));
		}
	});
	StructType schema = new StructType(new StructField[] {
			new StructField("words",
					DataTypes.createArrayType(DataTypes.StringType), false,
					Metadata.empty()) });
	DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
	// build a bigram language model
	NGram transformer = new NGram().setInputCol("words")
			.setOutputCol("ngrams").setN(2);
	DataFrame ngramDF = transformer.transform(wordDF);
	ngramDF.show(10, false);
	return ngramDF;
}
 
Example 6
Source Project: vn.vitk   Source File: DependencyParser.java    License: GNU General Public License v3.0
/**
 * Parses a list of PoS-tagged sentences, each on a line, and writes the result to an output
 * file in a specified output format.
 * @param jsc
 * @param sentences
 * @param outputFileName
 * @param outputFormat
 */
public void parse(JavaSparkContext jsc, List<String> sentences, String outputFileName, OutputFormat outputFormat) {
	JavaRDD<String> input = jsc.parallelize(sentences);
	JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
	JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
	JavaRDD<Row> rows = graphs.map(new Function<DependencyGraph, Row>() {
		private static final long serialVersionUID = -812004521983071103L;
		public Row call(DependencyGraph graph) {
			return RowFactory.create(graph.getSentence().toString(), graph.dependencies());
		}
	});
	StructType schema = new StructType(new StructField[]{
		new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),	
		new StructField("dependency", DataTypes.StringType, false, Metadata.empty())
	});
	SQLContext sqlContext = new SQLContext(jsc);
	DataFrame df = sqlContext.createDataFrame(rows, schema);
	
	if (outputFormat == OutputFormat.TEXT)  
		df.select("dependency").write().text(outputFileName);
	else 
		df.repartition(1).write().json(outputFileName);
}
 
Example 7
Source Project: hudi   Source File: TestFlatteningTransformer.java    License: Apache License 2.0
@Test
public void testFlatten() {
  FlatteningTransformer transformer = new FlatteningTransformer();

  // Init
  StructField[] nestedStructFields =
      new StructField[] {new StructField("nestedIntColumn", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("nestedStringColumn", DataTypes.StringType, true, Metadata.empty()),};

  StructField[] structFields =
      new StructField[] {new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty()),
          new StructField("nestedStruct", DataTypes.createStructType(nestedStructFields), true, Metadata.empty())};

  StructType schema = new StructType(structFields);
  String flattenedSql = transformer.flattenSchema(schema, null);

  assertEquals("intColumn as intColumn,stringColumn as stringColumn,"
      + "nestedStruct.nestedIntColumn as nestedStruct_nestedIntColumn,"
      + "nestedStruct.nestedStringColumn as nestedStruct_nestedStringColumn", flattenedSql);
}
 
Example 8
Source Project: envelope   Source File: TestRangeRowRule.java    License: Apache License 2.0
@Test
public void testDontIgnoreNulls() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0,105));
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", null, new BigDecimal("0.00"));
  assertFalse("Row should not pass rule", rule.check(row1));
}
 
Example 9
Source Project: envelope   Source File: TestRangeRowRule.java    License: Apache License 2.0
@Test
public void testRangeDataTypes() throws Exception {
  Config config = ConfigUtils.configFromResource("/dq/dq-range-rules.conf").getConfig("steps");
  StructType schema = new StructType(new StructField[] {
    new StructField("fa", DataTypes.LongType, false, Metadata.empty()),
    new StructField("fi", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("fl", DataTypes.LongType, false, Metadata.empty()),
    new StructField("ff", DataTypes.FloatType, false, Metadata.empty()),
    new StructField("fe", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("fd", DataTypes.createDecimalType(), false, Metadata.empty())
  });
  Row row = new RowWithSchema(schema, new Long(2), 2, new Long(2), new Float(2.0), 2.0, new BigDecimal("2.0"));
    
  ConfigObject rro = config.getObject("dq1.deriver.rules");
  for (String rulename : rro.keySet()) {
    Config rrc = rro.toConfig().getConfig(rulename);
    RangeRowRule rrr = new RangeRowRule();
    rrr.configure(rrc);
    rrr.configureName(rulename);
    assertTrue("Row should pass rule " + rulename, rrr.check(row));
  }
}
 
Example 10
Source Project: envelope   Source File: TestDecisionStep.java    License: Apache License 2.0
@Test
public void testPruneByStepValueTrue() {
  StructType schema = new StructType(new StructField[] {
      new StructField("outcome", DataTypes.BooleanType, false, Metadata.empty())
  });
  List<Row> rows = Lists.newArrayList(
      RowFactory.create(true)
  );
  Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
  step1.setData(ds);

  Map<String, Object> step2ConfigMap = Maps.newHashMap();
  step2ConfigMap.put(Step.DEPENDENCIES_CONFIG, Lists.newArrayList("step1"));
  step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
  step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_VALUE_DECISION_METHOD);
  step2ConfigMap.put(DecisionStep.STEP_BY_VALUE_STEP_PROPERTY, "step1");
  Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
  RefactorStep step2 = new DecisionStep("step2");
  step2.configure(step2Config);
  steps.add(step2);

  Set<Step> refactored = step2.refactor(steps);

  assertEquals(refactored, Sets.newHashSet(step1, step2, step3, step4, step7, step8));
}
 
Example 11
Source Project: envelope   Source File: TestDecisionStep.java    License: Apache License 2.0
@Test
public void testPruneByStepValueFalse() {
  StructType schema = new StructType(new StructField[] {
      new StructField("outcome", DataTypes.BooleanType, false, Metadata.empty())
  });
  List<Row> rows = Lists.newArrayList(
      RowFactory.create(false)
  );
  Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
  step1.setData(ds);

  Map<String, Object> step2ConfigMap = Maps.newHashMap();
  step2ConfigMap.put(Step.DEPENDENCIES_CONFIG, Lists.newArrayList("step1"));
  step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
  step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_VALUE_DECISION_METHOD);
  step2ConfigMap.put(DecisionStep.STEP_BY_VALUE_STEP_PROPERTY, "step1");
  Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
  RefactorStep step2 = new DecisionStep("step2");
  step2.configure(step2Config);
  steps.add(step2);

  Set<Step> refactored = step2.refactor(steps);

  assertEquals(refactored, Sets.newHashSet(step1, step2, step5, step6));
}
 
Example 12
private StructType readSchemaImpl() {
  StructField[] fields = info.getSchema().getFields().stream()
    .map(field ->
      new StructField(field.getName(),
        sparkFromArrow(field.getFieldType()),
        field.isNullable(),
        Metadata.empty()))
    .toArray(StructField[]::new);
  return new StructType(fields);
}
 
Example 13
/**
 * Extract a DataFrame ready for training or testing.
 * @param jsc
 * @param documents
 * @param sqlContext
 * @return a DataFrame ready for training or testing
 * @throws ResourceInitializationException
 */
public DataFrame extract(JavaSparkContext jsc, JavaRDD<SCAS> documents, SQLContext sqlContext) throws ResourceInitializationException {
    Accumulator<Integer> TOTAL_DOCS = jsc.accumulator(0, "TOTAL_DOCS");
    Accumulator<Integer> SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "SALIENT_ENTITY_INSTANCES");
    Accumulator<Integer> NON_SALIENT_ENTITY_INSTANCES = jsc.accumulator(0, "NON_SALIENT_ENTITY_INSTANCES");

    TrainingSettings trainingSettings = getTrainingSettings();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

    JavaRDD<TrainingInstance> trainingInstances =
            documents.flatMap(s -> {
                TOTAL_DOCS.add(1);
                return fe.getTrainingInstances(s.getJCas(),
                        trainingSettings.getFeatureExtractor(),
                        trainingSettings.getPositiveInstanceScalingFactor());
            });

    StructType schema = new StructType(new StructField[]{
            new StructField("docId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("entityId", DataTypes.StringType, false, Metadata.empty() ),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty() ),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    JavaRDD<Row> withFeatures = trainingInstances.map(ti -> {
        if (ti.getLabel() == 1.0) {
            SALIENT_ENTITY_INSTANCES.add(1);
        } else {
            NON_SALIENT_ENTITY_INSTANCES.add(1);
        }
        Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ti, featureVectorSize);
        return RowFactory.create(ti.getDocId(), ti.getEntityId(), ti.getLabel(), vei);
    });

    return sqlContext.createDataFrame(withFeatures, schema);
}
 
Example 14
Source Project: iceberg   Source File: TestSparkSchema.java    License: Apache License 2.0
@Test
public void testFailIfSparkReadSchemaIsOff() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("idd", DataTypes.IntegerType, true, Metadata.empty()) // wrong field name
          }
      );

  AssertHelpers.assertThrows("Iceberg should not allow a projection that contain unknown fields",
      java.lang.IllegalArgumentException.class, "Field idd not found in source schema",
      () ->
          spark.read()
              .schema(sparkReadSchema)
              .format("iceberg")
              .load(tableLocation)
  );
}
 
Example 15
Source Project: iceberg   Source File: TestSparkSchema.java    License: Apache License 2.0
@Test
public void testSparkReadSchemaCombinedWithProjection() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
              new StructField("data", DataTypes.StringType, true, Metadata.empty())
          }
      );

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation)
      .select("id");

  Row[] results = (Row[]) resultDf.collect();

  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
 
Example 16
Source Project: iceberg   Source File: TestSparkSchema.java    License: Apache License 2.0
@Test
public void testFailSparkReadSchemaCombinedWithProjectionWhenSchemaDoesNotContainProjection() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("data", DataTypes.StringType, true, Metadata.empty())
          }
      );

  AssertHelpers.assertThrows("Spark should not allow a projection that is not included in the read schema",
      org.apache.spark.sql.AnalysisException.class, "cannot resolve '`id`' given input columns: [data]",
      () ->
          spark.read()
            .schema(sparkReadSchema)
            .format("iceberg")
            .load(tableLocation)
            .select("id")
  );
}
 
Example 17
Source Project: sylph   Source File: SQLHepler.java    License: Apache License 2.0
public static StructType schemaToSparkType(Schema schema)
{
    StructField[] structFields = schema.getFields().stream().map(field ->
            StructField.apply(field.getName(), getSparkType(field.getJavaType()), true, Metadata.empty())
    ).toArray(StructField[]::new);

    StructType structType = new StructType(structFields);
    return structType;
}
 
Example 18
Source Project: mmtf-spark   Source File: QuaternaryStructureDataset.java    License: Apache License 2.0
/**
 * Returns a dataset with quaternary structure info.
 *
 * @param structure
 * @return dataset with quaternary structure info
 */
  public static Dataset<Row> getDataset(JavaPairRDD<String, StructureDataInterface> structure) {
      JavaRDD<Row> rows = structure.flatMap(t -> getQuaternaryStructure(t));
      
      StructType schema = new StructType(new StructField[]{
              new StructField("structureId", DataTypes.StringType, false, Metadata.empty()),
              new StructField("bioAssemblyId", DataTypes.StringType, false, Metadata.empty()),
              new StructField("proteinStoichiometry", DataTypes.StringType, true, Metadata.empty()),
              new StructField("dnaStoichiometry", DataTypes.StringType, true, Metadata.empty()),
              new StructField("rnaStoichiometry", DataTypes.StringType, true, Metadata.empty()),
      });
      
      SparkSession spark = SparkSession.builder().getOrCreate();
      return spark.createDataFrame(rows, schema);
  }
 
Example 19
Source Project: SparkDemo   Source File: JavaBucketizerExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBucketizerExample")
    .getOrCreate();

  // $example on$
  double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};

  List<Row> data = Arrays.asList(
    RowFactory.create(-999.9),
    RowFactory.create(-0.5),
    RowFactory.create(-0.3),
    RowFactory.create(0.0),
    RowFactory.create(0.2),
    RowFactory.create(999.9)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  Bucketizer bucketizer = new Bucketizer()
    .setInputCol("features")
    .setOutputCol("bucketedFeatures")
    .setSplits(splits);

  // Transform original data into its bucket index.
  Dataset<Row> bucketedData = bucketizer.transform(dataFrame);

  System.out.println("Bucketizer output with " + (bucketizer.getSplits().length-1) + " buckets");
  bucketedData.show();
  // $example off$

  spark.stop();
}
 
Example 20
Source Project: SparkDemo   Source File: JavaMinHashLSHExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaMinHashLSHExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, Vectors.sparse(6, new int[]{0, 1, 2}, new double[]{1.0, 1.0, 1.0})),
    RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0})),
    RowFactory.create(2, Vectors.sparse(6, new int[]{0, 2, 4}, new double[]{1.0, 1.0, 1.0}))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("keys", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  MinHashLSH mh = new MinHashLSH()
    .setNumHashTables(1)
    .setInputCol("keys")
    .setOutputCol("values");

  MinHashLSHModel model = mh.fit(dataFrame);
  model.transform(dataFrame).show();
  // $example off$

  spark.stop();
}
 
Example 21
Source Project: SparkDemo   Source File: JavaNormalizerExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaNormalizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
      RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
      RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
      RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  // Normalize each Vector using $L^1$ norm.
  Normalizer normalizer = new Normalizer()
    .setInputCol("features")
    .setOutputCol("normFeatures")
    .setP(1.0);

  Dataset<Row> l1NormData = normalizer.transform(dataFrame);
  l1NormData.show();

  // Normalize each Vector using $L^\infty$ norm.
  Dataset<Row> lInfNormData =
    normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
  lInfNormData.show();
  // $example off$

  spark.stop();
}
 
Example 22
Source Project: SparkDemo   Source File: JavaTfIdfExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaTfIdfExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0.0, "Hi I heard about Spark"),
    RowFactory.create(0.0, "I wish Java could use case classes"),
    RowFactory.create(1.0, "Logistic regression models are neat")
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
  });
  Dataset<Row> sentenceData = spark.createDataFrame(data, schema);

  Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
  Dataset<Row> wordsData = tokenizer.transform(sentenceData);

  int numFeatures = 20;
  HashingTF hashingTF = new HashingTF()
    .setInputCol("words")
    .setOutputCol("rawFeatures")
    .setNumFeatures(numFeatures);

  Dataset<Row> featurizedData = hashingTF.transform(wordsData);
  // alternatively, CountVectorizer can also be used to get term frequency vectors

  IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
  IDFModel idfModel = idf.fit(featurizedData);

  Dataset<Row> rescaledData = idfModel.transform(featurizedData);
  rescaledData.select("label", "features").show();
  // $example off$

  spark.stop();
}
 
Example 23
Source Project: SparkDemo   Source File: JavaMinMaxScalerExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaMinMaxScalerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
      RowFactory.create(0, Vectors.dense(1.0, 0.1, -1.0)),
      RowFactory.create(1, Vectors.dense(2.0, 1.1, 1.0)),
      RowFactory.create(2, Vectors.dense(3.0, 10.1, 3.0))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  MinMaxScaler scaler = new MinMaxScaler()
    .setInputCol("features")
    .setOutputCol("scaledFeatures");

  // Compute summary statistics and generate MinMaxScalerModel
  MinMaxScalerModel scalerModel = scaler.fit(dataFrame);

  // rescale each feature to range [min, max].
  Dataset<Row> scaledData = scalerModel.transform(dataFrame);
  System.out.println("Features scaled to range: [" + scaler.getMin() + ", "
      + scaler.getMax() + "]");
  scaledData.select("features", "scaledFeatures").show();
  // $example off$

  spark.stop();
}
 
Example 24
Source Project: SparkDemo   Source File: JavaQuantileDiscretizerExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaQuantileDiscretizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 18.0),
    RowFactory.create(1, 19.0),
    RowFactory.create(2, 8.0),
    RowFactory.create(3, 5.0),
    RowFactory.create(4, 2.2)
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("hour", DataTypes.DoubleType, false, Metadata.empty())
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);
  // $example off$
  // Output of QuantileDiscretizer for such small datasets can depend on the number of
  // partitions. Here we force a single partition to ensure consistent results.
  // Note this is not necessary for normal use cases
  df = df.repartition(1);
  // $example on$
  QuantileDiscretizer discretizer = new QuantileDiscretizer()
    .setInputCol("hour")
    .setOutputCol("result")
    .setNumBuckets(3);

  Dataset<Row> result = discretizer.fit(df).transform(df);
  result.show();
  // $example off$
  spark.stop();
}
 
Example 25
Source Project: SparkDemo   Source File: JavaChiSqSelectorExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaChiSqSelectorExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
    RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
    RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("features", new VectorUDT(), false, Metadata.empty()),
    new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty())
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);

  ChiSqSelector selector = new ChiSqSelector()
    .setNumTopFeatures(1)
    .setFeaturesCol("features")
    .setLabelCol("clicked")
    .setOutputCol("selectedFeatures");

  Dataset<Row> result = selector.fit(df).transform(df);

  System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures()
      + " features selected");
  result.show();

  // $example off$
  spark.stop();
}
 
Example 26
Source Project: SparkDemo   Source File: JavaDCTExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaDCTExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
    RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
    RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("features", new VectorUDT(), false, Metadata.empty()),
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  DCT dct = new DCT()
    .setInputCol("features")
    .setOutputCol("featuresDCT")
    .setInverse(false);

  Dataset<Row> dctDf = dct.transform(df);

  dctDf.select("featuresDCT").show(false);
  // $example off$

  spark.stop();
}
 
Example 27
Source Project: SparkDemo   Source File: JavaNGramExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaNGramExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
    RowFactory.create(1, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
    RowFactory.create(2, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField(
      "words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
  });

  Dataset<Row> wordDataFrame = spark.createDataFrame(data, schema);

  NGram ngramTransformer = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams");

  Dataset<Row> ngramDataFrame = ngramTransformer.transform(wordDataFrame);
  ngramDataFrame.select("ngrams").show(false);
  // $example off$

  spark.stop();
}
 
Example 28
Source Project: SparkDemo   Source File: JavaMaxAbsScalerExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaMaxAbsScalerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
      RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
      RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
      RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  MaxAbsScaler scaler = new MaxAbsScaler()
    .setInputCol("features")
    .setOutputCol("scaledFeatures");

  // Compute summary statistics and generate MaxAbsScalerModel
  MaxAbsScalerModel scalerModel = scaler.fit(dataFrame);

  // rescale each feature to range [-1, 1].
  Dataset<Row> scaledData = scalerModel.transform(dataFrame);
  scaledData.select("features", "scaledFeatures").show();
  // $example off$

  spark.stop();
}
 
Example 29
Source Project: SparkDemo   Source File: JavaBinarizerExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBinarizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 0.1),
    RowFactory.create(1, 0.8),
    RowFactory.create(2, 0.2)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

  Binarizer binarizer = new Binarizer()
    .setInputCol("feature")
    .setOutputCol("binarized_feature")
    .setThreshold(0.5);

  Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

  System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
  binarizedDataFrame.show();
  // $example off$

  spark.stop();
}
 
Example 30
Source Project: SparkDemo   Source File: JavaPCAExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaPCAExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})),
    RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
    RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("features", new VectorUDT(), false, Metadata.empty()),
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);

  PCAModel pca = new PCA()
    .setInputCol("features")
    .setOutputCol("pcaFeatures")
    .setK(3)
    .fit(df);

  Dataset<Row> result = pca.transform(df).select("pcaFeatures");
  result.show(false);
  // $example off$
  spark.stop();
}