org.apache.spark.sql.types.StructField Java Examples

The following examples show how to use org.apache.spark.sql.types.StructField. Each example is taken from an open source project; the project, source file, and license are listed above the code.
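As a quick orientation before the examples: the snippets below build fields in one of two equivalent ways, either through the DataTypes factory methods or through the StructField constructor with an explicit Metadata instance. A minimal sketch of both:

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class StructFieldSketch {
    public static void main(String[] args) {
        // Factory style, as used by most examples below.
        StructField id = DataTypes.createStructField("id", DataTypes.LongType, false);

        // Constructor style, with explicit (empty) metadata.
        StructField name = new StructField("name", DataTypes.StringType, true, Metadata.empty());

        // Fields combine into a StructType, the schema of a Dataset<Row>.
        StructType schema = new StructType(new StructField[] {id, name});
        System.out.println(schema.treeString());
    }
}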
Example #1
Source File: MLContextUtil.java    From systemds with Apache License 2.0
/**
 * Examine the DataFrame schema to determine whether the data appears to be
 * a matrix.
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if the DataFrame appears to be a matrix,
 *         {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
	StructType schema = df.schema();
	StructField[] fields = schema.fields();
	if (fields == null) {
		return true;
	}
	for (StructField field : fields) {
		DataType dataType = field.dataType();
		if ((dataType != DataTypes.DoubleType) && (dataType != DataTypes.IntegerType)
				&& (dataType != DataTypes.LongType) && (!(dataType instanceof org.apache.spark.ml.linalg.VectorUDT))
				&& (!(dataType instanceof org.apache.spark.mllib.linalg.VectorUDT))) {
			// uncomment if we support arrays of doubles for matrices
			// if (dataType instanceof ArrayType) {
			// ArrayType arrayType = (ArrayType) dataType;
			// if (arrayType.elementType() == DataTypes.DoubleType) {
			// continue;
			// }
			// }
			return false;
		}
	}
	return true;
}
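A hypothetical invocation of this check, assuming an existing SparkSession named spark: an empty DataFrame whose columns are all DoubleType passes, since only the schema is inspected.

// Hypothetical usage sketch; `spark` is an assumed SparkSession.
StructType numericSchema = DataTypes.createStructType(Arrays.asList(
        DataTypes.createStructField("C1", DataTypes.DoubleType, true),
        DataTypes.createStructField("C2", DataTypes.DoubleType, true)));
Dataset<Row> df = spark.createDataFrame(new ArrayList<Row>(), numericSchema);
boolean looksLikeMatrix = MLContextUtil.doesDataFrameLookLikeMatrix(df); // true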
 
Example #2
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLDoublesWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum DML, doubles with ID column, no format specified");

	List<String> list = new ArrayList<>();
	list.add("1,2,2,2");
	list.add("2,3,3,3");
	list.add("3,4,4,4");
	JavaRDD<String> javaRddString = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
	setExpectedStdOut("sum: 27.0");
	ml.execute(script);
}
 
Example #3
Source File: SchemaConverter.java    From geowave with Apache License 2.0
public static SimpleFeatureType schemaToFeatureType(
    final StructType schema,
    final String typeName) {
  final SimpleFeatureTypeBuilder typeBuilder = new SimpleFeatureTypeBuilder();
  typeBuilder.setName(typeName);
  typeBuilder.setNamespaceURI(BasicFeatureTypes.DEFAULT_NAMESPACE);
  try {
    typeBuilder.setCRS(CRS.decode("EPSG:4326", true));
  } catch (final FactoryException e) {
    LOGGER.error(e.getMessage(), e);
  }

  final AttributeTypeBuilder attrBuilder = new AttributeTypeBuilder();

  for (final StructField field : schema.fields()) {
    final AttributeDescriptor attrDesc = attrDescFromStructField(attrBuilder, field);

    typeBuilder.add(attrDesc);
  }

  return typeBuilder.buildFeatureType();
}
 
Example #4
Source File: UnaryTransformer.java    From ambiverse-nlu with Apache License 2.0
@Override
public StructType transformSchema(StructType structType) {
    String inputCol = getInputCol();
    String outputCol = getOutputCol();
    DataType inputType = structType.apply(inputCol).dataType();
    this.validateInputType(inputType);
    List<String> names = Arrays.asList(structType.fieldNames());
    Cond.require(!names.contains(outputCol), "The output column " + outputCol + " already exists in this schema!");
    List<StructField> fields = new ArrayList<>(Arrays.asList(structType.fields()));
    DataType dt = getOutputDataType();
    fields.add(DataTypes.createStructField(outputCol, dt, isOutputDataTypeNullable()));
    return DataTypes.createStructType(fields);
}
 
Example #5
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with ID column, no format specified");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #6
Source File: TestChainedTransformer.java    From hudi with Apache License 2.0
@Test
public void testChainedTransformation() {
  StructType schema = DataTypes.createStructType(
      new StructField[] {
          createStructField("foo", StringType, false)
      });
  Row r1 = RowFactory.create("100");
  Row r2 = RowFactory.create("200");
  Dataset<Row> original = sparkSession.sqlContext().createDataFrame(Arrays.asList(r1, r2), schema);

  Transformer t1 = (jsc, sparkSession, dataset, properties) -> dataset.withColumnRenamed("foo", "bar");
  Transformer t2 = (jsc, sparkSession, dataset, properties) -> dataset.withColumn("bar", dataset.col("bar").cast(IntegerType));
  ChainedTransformer transformer = new ChainedTransformer(Arrays.asList(t1, t2));
  Dataset<Row> transformed = transformer.apply(jsc, sparkSession, original, null);

  assertEquals(2, transformed.count());
  assertArrayEquals(new String[] {"bar"}, transformed.columns());
  List<Row> rows = transformed.collectAsList();
  assertEquals(100, rows.get(0).getInt(0));
  assertEquals(200, rows.get(1).getInt(0));
}
 
Example #7
Source File: TestRowUtils.java    From envelope with Apache License 2.0
@Test
public void testRemoveOneField() {
  StructField field1 = DataTypes.createStructField("field1", DataTypes.StringType, true);
  StructField field2 = DataTypes.createStructField("field2", DataTypes.IntegerType, true);
  StructField field3 = DataTypes.createStructField("field3", DataTypes.FloatType, true);
  StructType removeSchema = DataTypes.createStructType(Lists.newArrayList(field1, field2, field3));
  Row remove = new RowWithSchema(removeSchema, "hello", 1, 1.0);

  Row removed = RowUtils.remove(remove, "field2");

  Row expected = new RowWithSchema(
      DataTypes.createStructType(Lists.newArrayList(field1, field3)),
      "hello", 1.0);

  assertEquals(expected, removed);
}
 
Example #8
Source File: IndexRUtil.java    From indexr with Apache License 2.0
public static SegmentSchema sparkSchemaToIndexRSchema(List<StructField> sparkSchema, IsIndexed isIndexed) {
    List<ColumnSchema> columns = new ArrayList<>();
    for (StructField f : sparkSchema) {
        SQLType type;
        if (f.dataType() instanceof IntegerType) {
            type = SQLType.INT;
        } else if (f.dataType() instanceof LongType) {
            type = SQLType.BIGINT;
        } else if (f.dataType() instanceof FloatType) {
            type = SQLType.FLOAT;
        } else if (f.dataType() instanceof DoubleType) {
            type = SQLType.DOUBLE;
        } else if (f.dataType() instanceof StringType) {
            type = SQLType.VARCHAR;
        } else if (f.dataType() instanceof DateType) {
            type = SQLType.DATE;
        } else if (f.dataType() instanceof TimestampType) {
            type = SQLType.DATETIME;
        } else {
            throw new IllegalStateException("Unsupported type: " + f.dataType());
        }
        columns.add(new ColumnSchema(f.name(), type, isIndexed.apply(f.name())));
    }
    return new SegmentSchema(columns);
}
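A hypothetical call site, assuming IsIndexed is a functional interface whose apply(String) returns whether the named column should be indexed (that is how it is invoked above):

// Hypothetical usage sketch; IsIndexed is assumed to be a
// single-method interface: boolean apply(String columnName).
StructType sparkSchema = DataTypes.createStructType(Arrays.asList(
        DataTypes.createStructField("id", DataTypes.LongType, true),
        DataTypes.createStructField("name", DataTypes.StringType, true)));
SegmentSchema indexrSchema = IndexRUtil.sparkSchemaToIndexRSchema(
        Arrays.asList(sparkSchema.fields()),
        columnName -> columnName.equals("id")); // index only the id column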
 
Example #9
Source File: TestDecisionStep.java    From envelope with Apache License 2.0
@Test
public void testPruneByStepValueFalse() {
  StructType schema = new StructType(new StructField[] {
      new StructField("outcome", DataTypes.BooleanType, false, Metadata.empty())
  });
  List<Row> rows = Lists.newArrayList(
      RowFactory.create(false)
  );
  Dataset<Row> ds = Contexts.getSparkSession().createDataFrame(rows, schema);
  step1.setData(ds);

  Map<String, Object> step2ConfigMap = Maps.newHashMap();
  step2ConfigMap.put(Step.DEPENDENCIES_CONFIG, Lists.newArrayList("step1"));
  step2ConfigMap.put(DecisionStep.IF_TRUE_STEP_NAMES_PROPERTY, Lists.newArrayList("step3", "step7"));
  step2ConfigMap.put(DecisionStep.DECISION_METHOD_PROPERTY, DecisionStep.STEP_BY_VALUE_DECISION_METHOD);
  step2ConfigMap.put(DecisionStep.STEP_BY_VALUE_STEP_PROPERTY, "step1");
  Config step2Config = ConfigFactory.parseMap(step2ConfigMap);
  RefactorStep step2 = new DecisionStep("step2");
  step2.configure(step2Config);
  steps.add(step2);

  Set<Step> refactored = step2.refactor(steps);

  assertEquals(refactored, Sets.newHashSet(step1, step2, step5, step6));
}
 
Example #10
Source File: SimpleFeatureMapper.java    From geowave with Apache License 2.0
@Override
public Row call(final SimpleFeature feature) throws Exception {
  final Object[] fields = new Serializable[schema.size()];

  for (int i = 0; i < schema.size(); i++) {
    final Object fieldObj = feature.getAttribute(i);
    if (fieldObj != null) {
      final StructField structField = schema.apply(i);
      if (structField.name().equals("geom")) {
        fields[i] = fieldObj;
      } else if (structField.dataType() == DataTypes.TimestampType) {
        fields[i] = new Timestamp(((Date) fieldObj).getTime());
      } else if (structField.dataType() != null) {
        fields[i] = fieldObj;
      } else {
        LOGGER.error("Unexpected attribute in field(" + structField.name() + "): " + fieldObj);
      }
    }
  }

  return new GenericRowWithSchema(fields, schema);
}
 
Example #11
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");

	List<Vector> list = new ArrayList<>();
	list.add(Vectors.dense(1.0, 2.0, 3.0));
	list.add(Vectors.dense(4.0, 5.0, 6.0));
	list.add(Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #12
Source File: RDDConverterUtilsExtTest.java    From systemds with Apache License 2.0
@Test
public void testStringDataFrameToVectorDataFrameNull() {
	List<String> list = new ArrayList<>();
	list.add("[1.2, 3.4]");
	list.add(null);
	JavaRDD<String> javaRddString = sc.parallelize(list);
	JavaRDD<Row> javaRddRow = javaRddString.map(new StringToRow());
	SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> inDF = sparkSession.createDataFrame(javaRddRow, schema);
	Dataset<Row> outDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(sparkSession, inDF);

	List<String> expectedResults = new ArrayList<>();
	expectedResults.add("[[1.2,3.4]]");
	expectedResults.add("[null]");

	List<Row> outputList = outDF.collectAsList();
	for (Row row : outputList) {
		assertTrue("Expected results don't contain: " + row, expectedResults.contains(row.toString()));
	}
}
 
Example #13
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testGetTuple1DML() {
	System.out.println("MLContextTest - Get Tuple1<Matrix> DML");
	JavaRDD<String> javaRddString = sc
			.parallelize(Stream.of("1,2,3", "4,5,6", "7,8,9").collect(Collectors.toList()));
	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> df = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("N=M*2").in("M", df).out("N");
	Tuple1<Matrix> tuple = ml.execute(script).getTuple("N");
	double[][] n = tuple._1().to2DDoubleArray();
	Assert.assertEquals(2.0, n[0][0], 0);
	Assert.assertEquals(4.0, n[0][1], 0);
	Assert.assertEquals(6.0, n[0][2], 0);
	Assert.assertEquals(8.0, n[1][0], 0);
	Assert.assertEquals(10.0, n[1][1], 0);
	Assert.assertEquals(12.0, n[1][2], 0);
	Assert.assertEquals(14.0, n[2][0], 0);
	Assert.assertEquals(16.0, n[2][1], 0);
	Assert.assertEquals(18.0, n[2][2], 0);
}
 
Example #14
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #15
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLDoublesWithNoIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum DML, doubles with no ID column, no format specified");

	List<String> list = new ArrayList<>();
	list.add("2,2,2");
	list.add("3,3,3");
	list.add("4,4,4");
	JavaRDD<String> javaRddString = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
	setExpectedStdOut("sum: 27.0");
	ml.execute(script);
}
 
Example #16
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testInputMatrixBlockDML() {
	System.out.println("MLContextTest - input MatrixBlock DML");

	List<String> list = new ArrayList<>();
	list.add("10,20,30");
	list.add("40,50,60");
	list.add("70,80,90");
	JavaRDD<String> javaRddString = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.StringType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.StringType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Matrix m = new Matrix(dataFrame);
	MatrixBlock matrixBlock = m.toMatrixBlock();
	Script script = dml("avg = avg(M);").in("M", matrixBlock).out("avg");
	double avg = ml.execute(script).getDouble("avg");
	Assert.assertEquals(50.0, avg, 0.0);
}
 
Example #17
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testDontIgnoreNulls() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0,105));
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", null, new BigDecimal("0.00"));
  assertFalse("Row should not pass rule", rule.check(row1));
}
 
Example #18
Source File: QuaternaryStructureDataset.java    From mmtf-spark with Apache License 2.0
/**
 * Returns a dataset with quaternary structure info.
 *
 * @param structure structure data
 * @return dataset with quaternary structure info
 */
  public static Dataset<Row> getDataset(JavaPairRDD<String, StructureDataInterface> structure) {
      JavaRDD<Row> rows = structure.flatMap(t -> getQuaternaryStructure(t));
      
      StructType schema = new StructType(new StructField[]{
              new StructField("structureId", DataTypes.StringType, false, Metadata.empty()),
              new StructField("bioAssemblyId", DataTypes.StringType, false, Metadata.empty()),
              new StructField("proteinStoichiometry", DataTypes.StringType, true, Metadata.empty()),
              new StructField("dnaStoichiometry", DataTypes.StringType, true, Metadata.empty()),
              new StructField("rnaStoichiometry", DataTypes.StringType, true, Metadata.empty()),
      });
      
      SparkSession spark = SparkSession.builder().getOrCreate();
      return spark.createDataFrame(rows, schema);
  }
 
Example #19
Source File: ParquetWithSparkSchemaVisitor.java    From iceberg with Apache License 2.0
private static <T> T visitField(StructField sField, Type field, ParquetWithSparkSchemaVisitor<T> visitor) {
  visitor.fieldNames.push(field.getName());
  try {
    return visit(sField.dataType(), field, visitor);
  } finally {
    visitor.fieldNames.pop();
  }
}
 
Example #20
Source File: SchemaConverter.java    From toolbox with Apache License 2.0
static StructType getSchema(Attributes atts) {
    // Generate the schema based on the list of attributes, depending on their type:
    List<StructField> fields = new ArrayList<>();
    for (Attribute att : atts.getFullListOfAttributes()) {
        if (att.getStateSpaceType().getStateSpaceTypeEnum() == REAL)
            fields.add(DataTypes.createStructField(att.getName(), DataTypes.DoubleType, true));
        else
            fields.add(DataTypes.createStructField(att.getName(), DataTypes.StringType, true));
    }
    return DataTypes.createStructType(fields);
}
 
Example #21
Source File: InstanceRelationWriterTest.java    From rdf2x with Apache License 2.0
private StructType getExpectedSchemaOfAB(boolean isPredicateStored, boolean isPredicateStoredAsURI) {
    List<StructField> fields = new ArrayList<>();

    fields.add(DataTypes.createStructField("a" + ID_SUFFIX_JOINER + ID_COLUMN_NAME, DataTypes.LongType, false));
    fields.add(DataTypes.createStructField("b" + ID_SUFFIX_JOINER + ID_COLUMN_NAME, DataTypes.LongType, false));
    if (isPredicateStored) {
        fields.add(DataTypes.createStructField(PREDICATE_COLUMN_NAME, isPredicateStoredAsURI ? DataTypes.StringType : DataTypes.IntegerType, false));
    }

    return DataTypes.createStructType(fields);
}
 
Example #22
Source File: InteractionCenter.java    From mmtf-spark with Apache License 2.0
/**
 * Returns a schema to create Spark Datasets. This schema must match the
 * order in which the data are returned by the {@code getAsObject()} method.
 * 
 * @param index
 *            an integer index to label an interaction center
 * @return schema to represent an interaction center in a Spark Dataset.
 */
public static StructField[] getStructFields(int index) {
    boolean nullable = true;
    return new StructField[] { DataTypes.createStructField("atom" + index, DataTypes.StringType, nullable),
            DataTypes.createStructField("element" + index, DataTypes.StringType, nullable),
            DataTypes.createStructField("group" + index, DataTypes.StringType, nullable),
            DataTypes.createStructField("groupNum" + index, DataTypes.StringType, nullable),
            DataTypes.createStructField("type" + index, DataTypes.StringType, nullable),
            DataTypes.createStructField("chain" + index, DataTypes.StringType, nullable),
            DataTypes.createStructField("nbFactor" + index, DataTypes.FloatType, nullable)};
}
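Since the field names carry the center index, two centers can be concatenated into one pairwise schema. A minimal sketch (the downstream dataset creation is assumed):

// Assemble a pairwise-interaction schema from two interaction centers.
StructField[] center0 = InteractionCenter.getStructFields(0);
StructField[] center1 = InteractionCenter.getStructFields(1);
StructField[] combined = new StructField[center0.length + center1.length];
System.arraycopy(center0, 0, combined, 0, center0.length);
System.arraycopy(center1, 0, combined, center0.length, center1.length);
StructType schema = new StructType(combined); // atom0 ... nbFactor0, atom1 ... nbFactor1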
 
Example #23
Source File: JavaRFormulaExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaRFormulaExample")
    .getOrCreate();

  // $example on$
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("country", StringType, false),
    createStructField("hour", IntegerType, false),
    createStructField("clicked", DoubleType, false)
  });

  List<Row> data = Arrays.asList(
    RowFactory.create(7, "US", 18, 1.0),
    RowFactory.create(8, "CA", 12, 0.0),
    RowFactory.create(9, "NZ", 15, 0.0)
  );

  Dataset<Row> dataset = spark.createDataFrame(data, schema);
  RFormula formula = new RFormula()
    .setFormula("clicked ~ country + hour")
    .setFeaturesCol("features")
    .setLabelCol("label");
  Dataset<Row> output = formula.fit(dataset).transform(dataset);
  output.select("features", "label").show();
  // $example off$
  spark.stop();
}
 
Example #24
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testAgeRangeInt() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0,105));
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", 34, new BigDecimal("0.00"));
  assertTrue("Row should pass rule", rule.check(row1));

  Row row2 = new RowWithSchema(schema, "Webster1", "Websta1", 110, new BigDecimal("450.10"));
  assertFalse("Row should not pass rule", rule.check(row2));

  Row row3 = new RowWithSchema(schema, "", "Ian1", 106, new BigDecimal("450.10"));
  assertFalse("Row should not pass rule", rule.check(row3));

  Row row4 = new RowWithSchema(schema, "First Last", "Ian Last", 105, new BigDecimal("450.10"));
  assertTrue("Row should pass rule", rule.check(row4));
}
 
Example #25
Source File: JavaBucketizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBucketizerExample")
    .getOrCreate();

  // $example on$
  double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};

  List<Row> data = Arrays.asList(
    RowFactory.create(-999.9),
    RowFactory.create(-0.5),
    RowFactory.create(-0.3),
    RowFactory.create(0.0),
    RowFactory.create(0.2),
    RowFactory.create(999.9)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  Bucketizer bucketizer = new Bucketizer()
    .setInputCol("features")
    .setOutputCol("bucketedFeatures")
    .setSplits(splits);

  // Transform original data into its bucket index.
  Dataset<Row> bucketedData = bucketizer.transform(dataFrame);

  System.out.println("Bucketizer output with " + (bucketizer.getSplits().length-1) + " buckets");
  bucketedData.show();
  // $example off$

  spark.stop();
}
 
Example #26
Source File: SparkTypeVisitor.java    From iceberg with Apache License 2.0
static <T> T visit(DataType type, SparkTypeVisitor<T> visitor) {
  if (type instanceof StructType) {
    StructField[] fields = ((StructType) type).fields();
    List<T> fieldResults = Lists.newArrayListWithExpectedSize(fields.length);

    for (StructField field : fields) {
      fieldResults.add(visitor.field(
          field,
          visit(field.dataType(), visitor)));
    }

    return visitor.struct((StructType) type, fieldResults);

  } else if (type instanceof MapType) {
    return visitor.map((MapType) type,
        visit(((MapType) type).keyType(), visitor),
        visit(((MapType) type).valueType(), visitor));

  } else if (type instanceof ArrayType) {
    return visitor.array(
        (ArrayType) type,
        visit(((ArrayType) type).elementType(), visitor));

  } else if (type instanceof UserDefinedType) {
    throw new UnsupportedOperationException(
        "User-defined types are not supported");

  } else {
    return visitor.atomic(type);
  }
}
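A minimal concrete visitor sketch, assuming SparkTypeVisitor<T> declares the struct, field, map, array, and atomic callbacks invoked above; this one counts the atomic (leaf) types in a schema:

// Sketch only: assumes SparkTypeVisitor<T> exposes overridable
// struct/field/map/array/atomic methods matching the calls in visit().
class LeafCounter extends SparkTypeVisitor<Integer> {
    @Override
    public Integer struct(StructType struct, List<Integer> fieldResults) {
        return fieldResults.stream().mapToInt(Integer::intValue).sum();
    }

    @Override
    public Integer field(StructField field, Integer fieldResult) {
        return fieldResult;
    }

    @Override
    public Integer map(MapType map, Integer keyResult, Integer valueResult) {
        return keyResult + valueResult;
    }

    @Override
    public Integer array(ArrayType array, Integer elementResult) {
        return elementResult;
    }

    @Override
    public Integer atomic(DataType atomic) {
        return 1;
    }
}

// int leafCount = SparkTypeVisitor.visit(schema, new LeafCounter());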
 
Example #27
Source File: RDDConverterUtilsExtTest.java    From systemds with Apache License 2.0
@Test(expected = SparkException.class)
public void testStringDataFrameToVectorDataFrameNonNumbers() {
	List<String> list = new ArrayList<>();
	list.add("[cheeseburger,fries]");
	JavaRDD<String> javaRddString = sc.parallelize(list);
	JavaRDD<Row> javaRddRow = javaRddString.map(new StringToRow());
	SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> inDF = sparkSession.createDataFrame(javaRddRow, schema);
	Dataset<Row> outDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(sparkSession, inDF);
	// trigger evaluation to throw exception
	outDF.collectAsList();
}