org.apache.spark.sql.RowFactory Java Examples

The following examples show how to use org.apache.spark.sql.RowFactory. They are drawn from open-source projects; the header above each example names the original source file, project, and license.
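Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the class and column names are illustrative) of the pattern they all share: build Row objects with RowFactory.create, define a matching StructType schema, and turn them into a Dataset<Row> with SparkSession.createDataFrame. Each value passed to RowFactory.create must match the corresponding schema field in order and type.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowFactoryQuickStart {
  public static void main(String[] args) {
    SparkSession spark = SparkSession
      .builder()
      .appName("RowFactoryQuickStart")
      .master("local[*]")
      .getOrCreate();

    // RowFactory.create takes varargs: one call per row, values in schema order.
    // Passing an Object[] (as in Example #5 below) spreads it across the varargs.
    List<Row> data = Arrays.asList(
      RowFactory.create(1, "alice"),
      RowFactory.create(2, "bob")
    );

    StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("name", DataTypes.StringType, false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(data, schema);
    df.show();

    spark.stop();
  }
}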
Example #1
Source File: DatasetBalancerTest.java    From mmtf-spark with Apache License 2.0
@Test
public void test() {
	List<Row> rows = Arrays.asList(
			RowFactory.create("a", 1), RowFactory.create("a", 2), 
			RowFactory.create("b", 1), RowFactory.create("b", 2), RowFactory.create("b", 3), 
			RowFactory.create("c", 1), RowFactory.create("c", 2), RowFactory.create("c", 3), RowFactory.create("c", 4));

	SparkSession spark = SparkSession.builder().master("local[1]").getOrCreate();

	StructType schema = new StructType(
			new StructField[] { DataTypes.createStructField("key", DataTypes.StringType, false),
					DataTypes.createStructField("value", DataTypes.IntegerType, false) });

	Dataset<Row> data = spark.createDataFrame(rows, schema);

	long seed = 19;
	Dataset<Row> balancedData = DatasetBalancer.downsample(data, "key", seed);
	assertTrue(balancedData.count() > 0);
	
	spark.close();
}
 
Example #2
Source File: StructureToSecondaryStructureSegments.java    From mmtf-spark with Apache License 2.0
@Override
public Iterator<Row> call(Row t) throws Exception {
	//get information from the input Row
	String structureChainId = t.getString(0);
	String sequence = t.getString(1);
	String dsspQ8 = t.getString(5);
	String dsspQ3 = t.getString(6);
	
	int numSegments = Math.max(0, sequence.length() - length);
	List<Row> sequences = new ArrayList<>(numSegments);
	
	for (int i = 0; i < sequence.length() - length; i++)
	{
		String currSeq = sequence.substring(i, i+length);
		String labelQ8 = dsspQ8.substring(i + length/2,i + length/2 + 1);
		String labelQ3 = dsspQ3.substring(i + length/2,i + length/2 + 1);
		if ( !labelQ8.equals("X") && !labelQ3.equals("X"))
		{
			sequences.add( RowFactory.create(structureChainId, currSeq, labelQ8, labelQ3) );
		}
	}
	return sequences.iterator();
}
 
Example #3
Source File: JavaBinarizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBinarizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 0.1),
    RowFactory.create(1, 0.8),
    RowFactory.create(2, 0.2)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

  Binarizer binarizer = new Binarizer()
    .setInputCol("feature")
    .setOutputCol("binarized_feature")
    .setThreshold(0.5);

  Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

  System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
  binarizedDataFrame.show();
  // $example off$

  spark.stop();
}
 
Example #4
Source File: JavaSQLTransformerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaSQLTransformerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 1.0, 3.0),
    RowFactory.create(2, 2.0, 5.0)
  );
  StructType schema = new StructType(new StructField [] {
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("v2", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  SQLTransformer sqlTrans = new SQLTransformer().setStatement(
    "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__");

  sqlTrans.transform(df).show();
  // $example off$

  spark.stop();
}
 
Example #5
Source File: AtomInteraction.java    From mmtf-spark with Apache License 2.0
/**
 * Returns rows of pairwise interactions with the central atom.
 * 
 * @return rows of pairwise interactions with the central atom
 */
public List<Row> getPairInteractionsAsRows() {
	List<Row> rows = new ArrayList<>(neighbors.size());

	int length = InteractionCenter.getLength();
	
	calcCoordinationGeometry(neighbors.size());

	// copy data of the interacting atoms
	for (int i = 0; i < neighbors.size(); i++) {
		Object[] data = new Object[2 * length + 2];
		int index = 0;
		data[index++] = structureId;
		System.arraycopy(center.getAsObject(), 0, data, index, length);
		index += length;
		System.arraycopy(neighbors.get(i).getAsObject(), 0, data, index, length);
		index += length;
		data[index++] = distances[i];
		rows.add(RowFactory.create(data));
	}

	return rows;
}
 
Example #6
Source File: RelationExtractor.java    From rdf2x with Apache License 2.0
/**
 * Map a {@link Instance} into an Iterator of all of its relations
 * represented as rows of (related URI, predicate index, type index, instance ID)
 *
 * @param instance the requested {@link Instance}
 * @return an Iterator of all of its relations represented as rows of (related URI, predicate index, type index, instance ID)
 */
private Iterable<Row> getRelatedTypeIDs(Instance instance) {
    // typeIDs representing references to the instance in each table (or a single one, if instance has a single type)
    final Long id = instance.getId();

    final List<Tuple2<Integer, Long>> instanceTypeIDs = getRelationEntityTypes(instance)
            .map(typeIndex -> new Tuple2<>(typeIndex, id))
            .collect(Collectors.toList());

    return instance.getRelations().stream()
            .flatMap(relation ->
                    instanceTypeIDs.stream()
                            .map(instanceTypeID -> RowFactory.create(
                                    relation.getObjectURI(),
                                    relation.getPredicateIndex(),
                                    instanceTypeID._1(),
                                    instanceTypeID._2()
                            ))
            ).collect(Collectors.toList());
}
 
Example #7
Source File: JavaStopWordsRemoverExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaStopWordsRemoverExample")
    .getOrCreate();

  // $example on$
  StopWordsRemover remover = new StopWordsRemover()
    .setInputCol("raw")
    .setOutputCol("filtered");

  List<Row> data = Arrays.asList(
    RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
    RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField(
      "raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
  });

  Dataset<Row> dataset = spark.createDataFrame(data, schema);
  remover.transform(dataset).show(false);
  // $example off$
  spark.stop();
}
 
Example #8
Source File: FrameRDDConverterUtils.java    From systemds with Apache License 2.0
@Override
public Iterator<Row> call(Tuple2<Long, FrameBlock> arg0)
	throws Exception 
{
	long rowIndex = arg0._1();
	FrameBlock blk = arg0._2();
	ArrayList<Row> ret = new ArrayList<>();

	//handle Frame block data
	int rows = blk.getNumRows();
	int cols = blk.getNumColumns();
	for( int i=0; i<rows; i++ ) {
		Object[] row = new Object[cols+1];
		row[0] = (double)rowIndex++;
		for( int j=0; j<cols; j++ )
			row[j+1] = blk.get(i, j);
		ret.add(RowFactory.create(row));
	}
	
	return ret.iterator();
}
 
Example #9
Source File: MetadataWriter.java    From rdf2x with Apache License 2.0
/**
 * Write metadata describing entity tables
 *
 * @param entitySchema the entity schema
 */
public void writeEntityMetadata(EntitySchema entitySchema) {

    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ENTITIES_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(ENTITIES_LABEL, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(ENTITIES_NUM_ROWS, DataTypes.LongType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_URI));

    // register the entity name column as the table's primary key
    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    primaryKeys.add(new Tuple2<>(ENTITIES_TABLE_NAME, ENTITIES_NAME));

    final Map<String, String> uriLabels = rdfSchema.getUriLabels();
    // create table rows
    List<Row> rows = entitySchema.getTables().stream()
            .map(table -> {
                Object[] valueArray = new Object[]{
                        table.getName(),
                        table.getTypeURI(),
                        uriLabels.get(table.getTypeURI()),
                        table.getNumRows()
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    // create and write the META_Entities dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(ENTITIES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
 
Example #10
Source File: JavaElementwiseProductExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaElementwiseProductExample")
    .getOrCreate();

  // $example on$
  // Create some vector data; also works for sparse vectors
  List<Row> data = Arrays.asList(
    RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
    RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
  );

  List<StructField> fields = new ArrayList<StructField>(2);
  fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("vector", new VectorUDT(), false));

  StructType schema = DataTypes.createStructType(fields);

  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);

  ElementwiseProduct transformer = new ElementwiseProduct()
    .setScalingVec(transformingVector)
    .setInputCol("vector")
    .setOutputCol("transformedVector");

  // Batch transform the vectors to create new column:
  transformer.transform(dataFrame).show();
  // $example off$
  spark.stop();
}
 
Example #11
Source File: InstanceRelationWriterTest.java    From rdf2x with Apache License 2.0
@Test
public void testWriteRelationTablesWithPredicateIndex() throws IOException {
    InstanceRelationWriter writer = new InstanceRelationWriter(config
            .setStorePredicate(true), jsc(), persistor, rdfSchema);
    writer.writeRelationTables(getTestRelationSchema(), getTestRelations());

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1L, 3L, uriIndex.getIndex("http://example.com/knows")));
    rows.add(RowFactory.create(2L, 3L, uriIndex.getIndex("http://example.com/likes")));

    DataFrame result = this.result.values().iterator().next();
    assertEquals("Expected schema of A_B was extracted", getExpectedSchemaOfAB(true, false), result.schema());
    assertRDDEquals("Expected rows of A_B were extracted", jsc().parallelize(rows), result.toJavaRDD());
}
 
Example #12
Source File: InstanceRelationWriterTest.java    From rdf2x with Apache License 2.0
@Test
public void testWriteRelationTablesWithoutPredicateIndex() throws IOException {
    InstanceRelationWriter writer = new InstanceRelationWriter(config
            .setStorePredicate(false), jsc(), persistor, rdfSchema);
    writer.writeRelationTables(getTestRelationSchema(), getTestRelations());

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1L, 3L));
    rows.add(RowFactory.create(2L, 3L));

    DataFrame result = this.result.values().iterator().next();
    assertEquals("Expected schema of A_B was extracted", getExpectedSchemaOfAB(false, false), result.schema());
    assertRDDEquals("Expected rows of A_B were extracted", jsc().parallelize(rows), result.toJavaRDD());
}
 
Example #13
Source File: RelationExtractorTest.java    From rdf2x with Apache License 2.0
/**
 * Test if expected directed relations are collected from a RDD of Instances
 */
@Test
public void testCollectRelations() {
    SQLContext sql = new SQLContext(jsc());

    RelationExtractor collector = new RelationExtractor(
            new RelationConfig(),
            jsc(),
            new ClassGraph()
    );

    List<Row> rdd = new ArrayList<>();

    // cycle one -> two -> three -> one
    rdd.add(RowFactory.create(0, 1, 1L, 1, 2L));
    rdd.add(RowFactory.create(0, 1, 2L, 1, 3L));
    rdd.add(RowFactory.create(0, 1, 3L, 1, 1L));

    // one -> four, four -> one
    rdd.add(RowFactory.create(0, 2, 4L, 1, 1L));
    rdd.add(RowFactory.create(0, 1, 1L, 2, 4L));

    // five -> one
    rdd.add(RowFactory.create(0, 3, 5L, 1, 1L));

    DataFrame expected = sql.createDataFrame(rdd, new StructType()
            .add("predicateIndex", DataTypes.IntegerType, false)
            .add("fromTypeIndex", DataTypes.IntegerType, false)
            .add("fromID", DataTypes.LongType, false)
            .add("toTypeIndex", DataTypes.IntegerType, false)
            .add("toID", DataTypes.LongType, false)
    );

    // (predicateIndex, fromTypeIndex, instanceID, toTypeIndex, relatedID)
    DataFrame result = collector.extractRelations(getTestRDD());

    assertEquals("Expected relation row schema is collected", expected.schema(), result.schema());
    assertRDDEquals("Expected relation rows are collected", expected.javaRDD(), result.javaRDD());
}
 
Example #14
Source File: InstanceRelationWriter.java    From rdf2x with Apache License 2.0
private static Row getAttributeRow(Instance instance, Predicate predicate, Object value) {
    return RowFactory.create(
            instance.getId(),
            predicate.getPredicateIndex(),
            LiteralType.toString(predicate.getLiteralType()),
            predicate.getLanguage(),
            value.toString()
    );
}
 
Example #15
Source File: JavaBucketizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBucketizerExample")
    .getOrCreate();

  // $example on$
  double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};

  List<Row> data = Arrays.asList(
    RowFactory.create(-999.9),
    RowFactory.create(-0.5),
    RowFactory.create(-0.3),
    RowFactory.create(0.0),
    RowFactory.create(0.2),
    RowFactory.create(999.9)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  Bucketizer bucketizer = new Bucketizer()
    .setInputCol("features")
    .setOutputCol("bucketedFeatures")
    .setSplits(splits);

  // Transform original data into its bucket index.
  Dataset<Row> bucketedData = bucketizer.transform(dataFrame);

  System.out.println("Bucketizer output with " + (bucketizer.getSplits().length-1) + " buckets");
  bucketedData.show();
  // $example off$

  spark.stop();
}
 
Example #16
Source File: MetadataWriter.java    From rdf2x with Apache License 2.0
/**
 * Write metadata describing relation tables
 *
 * @param relationSchema the relation schema
 */
public void writeRelationMetadata(RelationSchema relationSchema) {
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RELATIONS_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(RELATIONS_FROM_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(RELATIONS_TO_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(RELATIONS_PREDICATE_ID, DataTypes.IntegerType, true));

    // create table rows
    List<Row> rows = relationSchema.getTables().stream()
            .map(table -> {
                RelationPredicateFilter predicateFilter = table.getPredicateFilter();
                RelationEntityFilter entityFilter = table.getEntityFilter();
                Object[] valueArray = new Object[]{
                        table.getName(),
                        entityFilter == null ? null : entityFilter.getFromTypeName(),
                        entityFilter == null ? null : entityFilter.getToTypeName(),
                        predicateFilter == null ? null : rdfSchema.getPredicateIndex().getIndex(predicateFilter.getPredicateURI())
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    StructType schema = DataTypes.createStructType(fields);

    // add index for each field
    List<Tuple2<String, String>> indexes = fields.stream()
            .map(field -> new Tuple2<>(RELATIONS_TABLE_NAME, field.name()))
            .collect(Collectors.toList());

    // create and write the META_Relations dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(RELATIONS_TABLE_NAME, df);
    persistor.createIndexes(indexes);
    df.unpersist();
}
 
Example #17
Source File: MetadataWriter.java    From rdf2x with Apache License 2.0
/**
 * Persist predicate metadata table storing all predicates.
 */
public void writePredicateMetadata() {

    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(PREDICATE_ID, DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField(PREDICATE_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(PREDICATE_LABEL, DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(PREDICATES_TABLE_NAME, PREDICATE_URI));

    List<Tuple2<String, String>> primaryKeys = new ArrayList<>();
    primaryKeys.add(new Tuple2<>(PREDICATES_TABLE_NAME, PREDICATE_ID));


    final IndexMap<String> predicateIndex = rdfSchema.getPredicateIndex();
    final Map<String, String> uriLabels = rdfSchema.getUriLabels();
    // create table rows
    List<Row> rows = predicateIndex.getValues().stream()
            .map(uri -> {
                Object[] valueArray = new Object[]{
                        predicateIndex.getIndex(uri),
                        uri,
                        uriLabels.get(uri)
                };
                return RowFactory.create(valueArray);
            }).collect(Collectors.toList());

    // create and write the META_Predicates dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(PREDICATES_TABLE_NAME, df);
    persistor.createPrimaryKeys(primaryKeys);
    persistor.createIndexes(indexes);
    df.unpersist();
}
 
Example #18
Source File: RelationExtractor.java    From rdf2x with Apache License 2.0
/**
 * Map all types of an {@link Instance} to a row of (instance URI, instance type index, instance ID)
 *
 * @param instance the requested {@link Instance}
 * @return a row of (instance URI, instance type index, instance ID)
 */
private Iterable<Row> getInstanceTypeIDs(Instance instance) {
    String instanceURI = instance.getUri();
    Long instanceID = instance.getId();

    return getRelationEntityTypes(instance)
            .map(typeIndex -> RowFactory.create(instanceURI, typeIndex, instanceID))
            .collect(Collectors.toList());
}
 
Example #19
Source File: InstanceRelationWriterTest.java    From rdf2x with Apache License 2.0
private JavaRDD<Row> getExpectedRowsOfSingleRelationTable() {
    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create(1L, 3L, uriIndex.getIndex("http://example.com/knows")));
    rows.add(RowFactory.create(2L, 3L, uriIndex.getIndex("http://example.com/likes")));

    return jsc().parallelize(rows);
}
 
Example #20
Source File: JavaWord2VecExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaWord2VecExample")
    .getOrCreate();

  // $example on$
  // Input data: Each row is a bag of words from a sentence or document.
  List<Row> data = Arrays.asList(
    RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
    RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
    RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
  });
  Dataset<Row> documentDF = spark.createDataFrame(data, schema);

  // Learn a mapping from words to Vectors.
  Word2Vec word2Vec = new Word2Vec()
    .setInputCol("text")
    .setOutputCol("result")
    .setVectorSize(3)
    .setMinCount(0);

  Word2VecModel model = word2Vec.fit(documentDF);
  Dataset<Row> result = model.transform(documentDF);

  for (Row row : result.collectAsList()) {
    List<String> text = row.getList(0);
    Vector vector = (Vector) row.get(1);
    System.out.println("Text: " + text + " => \nVector: " + vector + "\n");
  }
  // $example off$

  spark.stop();
}
 
Example #21
Source File: JavaQuantileDiscretizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaQuantileDiscretizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 18.0),
    RowFactory.create(1, 19.0),
    RowFactory.create(2, 8.0),
    RowFactory.create(3, 5.0),
    RowFactory.create(4, 2.2)
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("hour", DataTypes.DoubleType, false, Metadata.empty())
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);
  // $example off$
  // Output of QuantileDiscretizer for such small datasets can depend on the number of
  // partitions. Here we force a single partition to ensure consistent results.
  // Note this is not necessary for normal use cases
  df = df.repartition(1);
  // $example on$
  QuantileDiscretizer discretizer = new QuantileDiscretizer()
    .setInputCol("hour")
    .setOutputCol("result")
    .setNumBuckets(3);

  Dataset<Row> result = discretizer.fit(df).transform(df);
  result.show();
  // $example off$
  spark.stop();
}
 
Example #22
Source File: JavaMinMaxScalerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaMinMaxScalerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
      RowFactory.create(0, Vectors.dense(1.0, 0.1, -1.0)),
      RowFactory.create(1, Vectors.dense(2.0, 1.1, 1.0)),
      RowFactory.create(2, Vectors.dense(3.0, 10.1, 3.0))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  MinMaxScaler scaler = new MinMaxScaler()
    .setInputCol("features")
    .setOutputCol("scaledFeatures");

  // Compute summary statistics and generate MinMaxScalerModel
  MinMaxScalerModel scalerModel = scaler.fit(dataFrame);

  // rescale each feature to range [min, max].
  Dataset<Row> scaledData = scalerModel.transform(dataFrame);
  System.out.println("Features scaled to range: [" + scaler.getMin() + ", "
      + scaler.getMax() + "]");
  scaledData.select("features", "scaledFeatures").show();
  // $example off$

  spark.stop();
}
 
Example #23
Source File: JavaTfIdfExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaTfIdfExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0.0, "Hi I heard about Spark"),
    RowFactory.create(0.0, "I wish Java could use case classes"),
    RowFactory.create(1.0, "Logistic regression models are neat")
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
  });
  Dataset<Row> sentenceData = spark.createDataFrame(data, schema);

  Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
  Dataset<Row> wordsData = tokenizer.transform(sentenceData);

  int numFeatures = 20;
  HashingTF hashingTF = new HashingTF()
    .setInputCol("words")
    .setOutputCol("rawFeatures")
    .setNumFeatures(numFeatures);

  Dataset<Row> featurizedData = hashingTF.transform(wordsData);
  // alternatively, CountVectorizer can also be used to get term frequency vectors

  IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
  IDFModel idfModel = idf.fit(featurizedData);

  Dataset<Row> rescaledData = idfModel.transform(featurizedData);
  rescaledData.select("label", "features").show();
  // $example off$

  spark.stop();
}
 
Example #24
Source File: JavaVectorSlicerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaVectorSlicerExample")
    .getOrCreate();

  // $example on$
  Attribute[] attrs = new Attribute[]{
    NumericAttribute.defaultAttr().withName("f1"),
    NumericAttribute.defaultAttr().withName("f2"),
    NumericAttribute.defaultAttr().withName("f3")
  };
  AttributeGroup group = new AttributeGroup("userFeatures", attrs);

  List<Row> data = Lists.newArrayList(
    RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
    RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
  );

  Dataset<Row> dataset =
    spark.createDataFrame(data, (new StructType()).add(group.toStructField()));

  VectorSlicer vectorSlicer = new VectorSlicer()
    .setInputCol("userFeatures").setOutputCol("features");

  vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
  // or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"})

  Dataset<Row> output = vectorSlicer.transform(dataset);
  output.show(false);
  // $example off$

  spark.stop();
}
 
Example #25
Source File: JavaNormalizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaNormalizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
      RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
      RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
      RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  // Normalize each Vector using $L^1$ norm.
  Normalizer normalizer = new Normalizer()
    .setInputCol("features")
    .setOutputCol("normFeatures")
    .setP(1.0);

  Dataset<Row> l1NormData = normalizer.transform(dataFrame);
  l1NormData.show();

  // Normalize each Vector using $L^\infty$ norm.
  Dataset<Row> lInfNormData =
    normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
  lInfNormData.show();
  // $example off$

  spark.stop();
}
 
Example #26
Source File: InstanceRelationWriterTest.java    From rdf2x with Apache License 2.0
private DataFrame getTestRelations() {
    List<Row> rows = new ArrayList<>();

    rows.add(RowFactory.create(
            uriIndex.getIndex("http://example.com/knows"),
            uriIndex.getIndex("http://example.com/a"),
            1L,
            uriIndex.getIndex("http://example.com/b"),
            3L
    ));

    rows.add(RowFactory.create(
            uriIndex.getIndex("http://example.com/likes"),
            uriIndex.getIndex("http://example.com/a"),
            2L,
            uriIndex.getIndex("http://example.com/b"),
            3L
    ));

    return sql.createDataFrame(rows, new StructType()
            .add("predicateIndex", DataTypes.IntegerType, false)
            .add("fromTypeIndex", DataTypes.IntegerType, false)
            .add("fromID", DataTypes.LongType, false)
            .add("toTypeIndex", DataTypes.IntegerType, false)
            .add("toID", DataTypes.LongType, false)
    );
}
 
Example #27
Source File: JavaMinHashLSHExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaMinHashLSHExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, Vectors.sparse(6, new int[]{0, 1, 2}, new double[]{1.0, 1.0, 1.0})),
    RowFactory.create(1, Vectors.sparse(6, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0})),
    RowFactory.create(2, Vectors.sparse(6, new int[]{0, 2, 4}, new double[]{1.0, 1.0, 1.0}))
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("keys", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  MinHashLSH mh = new MinHashLSH()
    .setNumHashTables(1)
    .setInputCol("keys")
    .setOutputCol("values");

  MinHashLSHModel model = mh.fit(dataFrame);
  model.transform(dataFrame).show();
  // $example off$

  spark.stop();
}
 
Example #28
Source File: JavaVectorAssemblerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaVectorAssemblerExample")
    .getOrCreate();

  // $example on$
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("hour", IntegerType, false),
    createStructField("mobile", DoubleType, false),
    createStructField("userFeatures", new VectorUDT(), false),
    createStructField("clicked", DoubleType, false)
  });
  Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
  Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);

  VectorAssembler assembler = new VectorAssembler()
    .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
    .setOutputCol("features");

  Dataset<Row> output = assembler.transform(dataset);
  System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
      "'features'");
  output.select("features", "clicked").show(false);
  // $example off$

  spark.stop();
}
 
Example #29
Source File: JavaRFormulaExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaRFormulaExample")
    .getOrCreate();

  // $example on$
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("country", StringType, false),
    createStructField("hour", IntegerType, false),
    createStructField("clicked", DoubleType, false)
  });

  List<Row> data = Arrays.asList(
    RowFactory.create(7, "US", 18, 1.0),
    RowFactory.create(8, "CA", 12, 0.0),
    RowFactory.create(9, "NZ", 15, 0.0)
  );

  Dataset<Row> dataset = spark.createDataFrame(data, schema);
  RFormula formula = new RFormula()
    .setFormula("clicked ~ country + hour")
    .setFeaturesCol("features")
    .setLabelCol("label");
  Dataset<Row> output = formula.fit(dataset).transform(dataset);
  output.select("features", "label").show();
  // $example off$
  spark.stop();
}
 
Example #30
Source File: QuaternaryStructureDataset.java    From mmtf-spark with Apache License 2.0
private static Iterator<Row> getQuaternaryStructure(Tuple2<String, StructureDataInterface> t) throws Exception {
	List<Row> rows = new ArrayList<>();
    String key = t._1;
	StructureDataInterface structure = t._2;
	ColumnarStructure cs = new ColumnarStructure(structure, true);
	String[] chainEntityTypes = cs.getChainEntityTypes();
	int[] chainToEntityIndex = cs.getChainToEntityIndices();
	
	for (int i = 0; i < structure.getNumBioassemblies(); i++) {
	    List<Integer> proteinIndices = new ArrayList<>();
	    List<Integer> dnaIndices = new ArrayList<>();
	    List<Integer> rnaIndices = new ArrayList<>();
	   
	    for (int j = 0; j < structure.getNumTransInBioassembly(i); j++) {
	        for (int chainIndex : structure.getChainIndexListForTransform(i, j)) {
	            int entityIndex = chainToEntityIndex[chainIndex];
	            String type = chainEntityTypes[chainIndex];
	            if (type.equals("PRO")) {
	                proteinIndices.add(entityIndex);
	            } else if (type.equals("DNA")) {
	                dnaIndices.add(entityIndex);
	            } else if (type.equals("RNA")) {
	                rnaIndices.add(entityIndex);
	            }
	        }
	    }
	    
	    String proStoich = stoichiometry(coefficients(proteinIndices));
	    String dnaStoich = stoichiometry(coefficients(dnaIndices));
	    String rnaStoich = stoichiometry(coefficients(rnaIndices));
	    rows.add(RowFactory.create(key, structure.getBioassemblyName(i), proStoich, dnaStoich, rnaStoich));
	}

	return rows.iterator();
}