Java Code Examples for org.apache.spark.sql.types.DataTypes

The following examples show how to use org.apache.spark.sql.types.DataTypes. They are extracted from open source projects; the source project, source file, and license are noted above each example.
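As a quick orientation before the project examples: the recurring pattern below combines the singleton type objects (DataTypes.StringType, DataTypes.DoubleType, and so on) with the createStructField and createStructType factory methods. Here is a minimal, self-contained sketch (not taken from any of the projects below):

import java.util.Arrays;

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class DataTypesSketch {
  public static void main(String[] args) {
    // Singleton type objects describe primitive column types;
    // the third argument to createStructField is nullability
    StructField name = DataTypes.createStructField("name", DataTypes.StringType, false);
    StructField age = DataTypes.createStructField("age", DataTypes.IntegerType, true);

    // createStructType assembles the fields into a schema
    StructType schema = DataTypes.createStructType(Arrays.asList(name, age));
    System.out.println(schema.treeString());
  }
}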
Example 1
Source Project: systemds   Source File: MLContextTest.java    License: Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with ID column");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example 2
Source Project: envelope   Source File: TestMorphlineTranslator.java    License: Apache License 2.0
@Test (expected = MorphlineCompilationException.class)
public void invalidCommand() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(MorphlineTranslator.ENCODING_KEY, "UTF-8");
  configMap.put(MorphlineTranslator.ENCODING_MSG, "UTF-8");
  configMap.put(MorphlineTranslator.MORPHLINE, getResourcePath(MORPHLINE_FILE));
  configMap.put(MorphlineTranslator.MORPHLINE_ID, "invalid-command");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "flat");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
      Lists.newArrayList("int", "str", "float"));
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
      Lists.newArrayList("int", "string", "float"));
  Config config = ConfigFactory.parseMap(configMap);

  translator.configure(config);
  Row raw = TestingMessageFactory.get("The Key", DataTypes.StringType,
      "The Message", DataTypes.StringType);
  translator.translate(raw);
}
 
Example 3
Source Project: systemds   Source File: MLContextTest.java    License: Apache License 2.0
@Test
public void testGetTuple1DML() {
	System.out.println("MLContextTest - Get Tuple1<Matrix> DML");
	JavaRDD<String> javaRddString = sc
			.parallelize(Stream.of("1,2,3", "4,5,6", "7,8,9").collect(Collectors.toList()));
	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> df = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("N=M*2").in("M", df).out("N");
	Tuple1<Matrix> tuple = ml.execute(script).getTuple("N");
	double[][] n = tuple._1().to2DDoubleArray();
	Assert.assertEquals(2.0, n[0][0], 0);
	Assert.assertEquals(4.0, n[0][1], 0);
	Assert.assertEquals(6.0, n[0][2], 0);
	Assert.assertEquals(8.0, n[1][0], 0);
	Assert.assertEquals(10.0, n[1][1], 0);
	Assert.assertEquals(12.0, n[1][2], 0);
	Assert.assertEquals(14.0, n[2][0], 0);
	Assert.assertEquals(16.0, n[2][1], 0);
	Assert.assertEquals(18.0, n[2][2], 0);
}
 
Example 4
private void runWithSparkBuildGlobalDict(NGlobalDictionaryV2 dict, List<String> stringSet) throws IOException {
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    dict.prepareWrite();
    List<Row> rowList = Lists.newLinkedList();
    for (String str : stringSet) {
        rowList.add(RowFactory.create(str));
    }
    Dataset<Row> ds = ss.createDataFrame(rowList,
            new StructType(new StructField[] { DataTypes.createStructField("col1", DataTypes.StringType, true) }));
    ds.toJavaRDD().mapToPair((PairFunction<Row, String, String>) row -> {
        if (row.get(0) == null)
            return new Tuple2<>(null, null);
        return new Tuple2<>(row.get(0).toString(), null);
    }).sortByKey().partitionBy(new HashPartitioner(BUCKET_SIZE)).mapPartitionsWithIndex(
            (Function2<Integer, Iterator<Tuple2<String, String>>, Iterator<Object>>) (bucketId, tuple2Iterator) -> {
                NBucketDictionary bucketDict = dict.loadBucketDictionary(bucketId);
                while (tuple2Iterator.hasNext()) {
                    Tuple2<String, String> tuple2 = tuple2Iterator.next();
                    bucketDict.addRelativeValue(tuple2._1);
                }
                bucketDict.saveBucketDict(bucketId);
                return Lists.newArrayList().iterator();
            }, true).count();

    dict.writeMetaDict(BUCKET_SIZE, config.getGlobalDictV2MaxVersions(), config.getGlobalDictV2VersionTTL());
}
 
Example 5
Source Project: stocator   Source File: TestSuite.java    License: Apache License 2.0
public void test16(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
    throws Exception {
  System.out.println("*********************************");
  System.out.println("T16: Non overwrite mode " + containerOut);
  String o1 = containerOut + "myData/123";
  StructType schema = DataTypes
      .createStructType(new StructField[] { DataTypes.createStructField("NAME", DataTypes.StringType, false),
          DataTypes.createStructField("STRING_VALUE", DataTypes.StringType, false),
          DataTypes.createStructField("NUM_VALUE", DataTypes.IntegerType, false), });
  Row r1 = RowFactory.create("name1", "value1", 1);
  Row r2 = RowFactory.create("name2", "value2", 2);
  List<Row> rowList = ImmutableList.of(r1, r2);
  Dataset<Row> rows = spark.createDataFrame(rowList, schema);
  try {
    if (type.equals(Constants.PARQUET_TYPE)) {
      rows.write().mode(SaveMode.Overwrite).parquet(o1);
    } else if (type.equals(Constants.JSON_TYPE)) {
      rows.write().mode(SaveMode.Overwrite).json(o1);
    }
  } catch (Exception e) {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
    throw e;
  } finally {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
  }
}
 
Example 6
Source Project: geowave   Source File: SchemaConverter.java    License: Apache License 2.0
public static StructType schemaFromFeatureType(final SimpleFeatureType featureType) {
  final List<StructField> fields = new ArrayList<>();

  for (final AttributeDescriptor attrDesc : featureType.getAttributeDescriptors()) {
    final SimpleFeatureDataType sfDataType = attrDescToDataType(attrDesc);

    final String fieldName = (sfDataType.isGeom() ? "geom" : attrDesc.getName().getLocalPart());

    final StructField field =
        DataTypes.createStructField(fieldName, sfDataType.getDataType(), true);

    fields.add(field);
  }

  if (fields.isEmpty()) {
    LOGGER.error("Feature type produced empty dataframe schema!");
    return null;
  }

  return DataTypes.createStructType(fields);
}
 
Example 7
Source Project: systemds   Source File: MLContextUtil.java    License: Apache License 2.0
/**
 * Examine the DataFrame schema to determine whether the data appears to be
 * a matrix.
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if the DataFrame appears to be a matrix,
 *         {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
	StructType schema = df.schema();
	StructField[] fields = schema.fields();
	if (fields == null) {
		return true;
	}
	for (StructField field : fields) {
		DataType dataType = field.dataType();
		if ((dataType != DataTypes.DoubleType) && (dataType != DataTypes.IntegerType)
				&& (dataType != DataTypes.LongType) && (!(dataType instanceof org.apache.spark.ml.linalg.VectorUDT))
				&& (!(dataType instanceof org.apache.spark.mllib.linalg.VectorUDT))) {
			// uncomment if we support arrays of doubles for matrices
			// if (dataType instanceof ArrayType) {
			// ArrayType arrayType = (ArrayType) dataType;
			// if (arrayType.elementType() == DataTypes.DoubleType) {
			// continue;
			// }
			// }
			return false;
		}
	}
	return true;
}
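
To see the check above in action, here is a hedged usage sketch (assuming an existing SparkSession named spark and the MLContextUtil class from the SystemDS source above):

// Sketch only: spark is an assumed, pre-built SparkSession
StructType numericSchema = DataTypes.createStructType(Arrays.asList(
    DataTypes.createStructField("C1", DataTypes.DoubleType, true),
    DataTypes.createStructField("C2", DataTypes.DoubleType, true)));
Dataset<Row> numeric = spark.createDataFrame(
    Arrays.asList(RowFactory.create(1.0, 2.0)), numericSchema);
// Every column is DoubleType, so the DataFrame looks like a matrix
System.out.println(MLContextUtil.doesDataFrameLookLikeMatrix(numeric)); // true

StructType mixedSchema = DataTypes.createStructType(Arrays.asList(
    DataTypes.createStructField("C1", DataTypes.StringType, true)));
Dataset<Row> mixed = spark.createDataFrame(
    Arrays.asList(RowFactory.create("a")), mixedSchema);
// A StringType column disqualifies it
System.out.println(MLContextUtil.doesDataFrameLookLikeMatrix(mixed)); // false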
 
Example 8
Source Project: envelope   Source File: TestAvroUtils.java    License: Apache License 2.0
@Test
public void toTypeSchemaStructTypeFieldNullable() throws Exception {
  Schema schema = AvroUtils.typeFor(DataTypes.createStructType(
      Lists.newArrayList(
          DataTypes.createStructField("field1", DataTypes.StringType, true)
      )),
      false);

  assertEquals("Invalid type", Schema.Type.RECORD, schema.getType());
  assertEquals("Invalid record name", "record0", schema.getName());
  assertEquals("Invalid field count", 1, schema.getFields().size());
  assertEquals("Invalid field name", "field1", schema.getFields().get(0).name());
  assertEquals("Invalid field type", Schema.Type.UNION, schema.getFields().get(0).schema().getType());

  for (Schema s : schema.getFields().get(0).schema().getTypes()) {
    assertThat("Invalid union types", s.getType(), anyOf(is(Schema.Type.STRING), is(Schema.Type.NULL)));
  }

  //System.out.println(schema.toString(true));
}
 
Example 9
Source Project: envelope   Source File: TestRowUtils.java    License: Apache License 2.0
@Test
public void testToRowValueDate() {
  DataType field = DataTypes.DateType;

  DateTime dateObj = DateTime.parse("2017-01-01T00:00:00"); // Pass-thru the TZ
  Date sqlDate = new Date(dateObj.getMillis());

  assertEquals("Invalid Long", sqlDate, RowUtils.toRowValue(dateObj.getMillis(), field));
  assertEquals("Invalid String", sqlDate, RowUtils.toRowValue("2017-001", field)); // ISO Date format
  assertEquals("Invalid Date", sqlDate, RowUtils.toRowValue(dateObj.toDate(), field));
  assertEquals("Invalid DateTime", sqlDate, RowUtils.toRowValue(dateObj, field));

  thrown.expect(RuntimeException.class);
  thrown.expectMessage(CoreMatchers.containsString("Invalid or unrecognized input format"));
  RowUtils.toRowValue(123, field);
}
 
Example 10
Source Project: envelope   Source File: TestDelimitedSerializer.java    License: Apache License 2.0
@Test
public void testDelimitedSerialization() {
  List<StructField> fields = Lists.newArrayList(
      DataTypes.createStructField("field1", DataTypes.StringType, true),
      DataTypes.createStructField("field2", DataTypes.IntegerType, true),
      DataTypes.createStructField("field3", DataTypes.BooleanType, true)
  );
  Row row = new RowWithSchema(DataTypes.createStructType(fields), "hello", 1, false);
  
  Map<String, String> configs = Maps.newHashMap();
  configs.put(DelimitedSerializer.FIELD_DELIMITER_CONFIG_NAME, "||");
  Serializer<Row> serializer = new DelimitedSerializer();
  serializer.configure(configs, false);
  
  byte[] serialized = serializer.serialize("test", row);
  serializer.close();
  
  assertEquals(new String(serialized), "hello||1||false");
}
 
Example 11
Source Project: envelope   Source File: TestMorphlineTranslator.java    License: Apache License 2.0
@Test (expected = MorphlineRuntimeException.class)
public void noRecordReturned() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(MorphlineTranslator.ENCODING_KEY, "UTF-8");
  configMap.put(MorphlineTranslator.ENCODING_MSG, "UTF-8");
  configMap.put(MorphlineTranslator.MORPHLINE, getResourcePath(MORPHLINE_FILE));
  configMap.put(MorphlineTranslator.MORPHLINE_ID, "no-return");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "flat");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
      Lists.newArrayList("int", "str", "float"));
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
      Lists.newArrayList("int", "string", "float"));
  Config config = ConfigFactory.parseMap(configMap);

  translator.configure(config);
  Row raw = TestingMessageFactory.get("The Key", DataTypes.StringType, 
      "The Message", DataTypes.StringType);
  translator.translate(raw);
}
 
Example 12
Source Project: systemds   Source File: MLContextTest.java    License: Apache License 2.0
@Test
public void testDataFrameSumDMLMllibVectorWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, mllib vector with no ID column");

	List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<>();
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example 13
Source Project: envelope   Source File: TestConfigurationDataTypes.java    License: Apache License 2.0
@Test
public void testGetSparkDataTypeValid() {
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.DECIMAL), new DecimalType());
  assertEquals(ConfigurationDataTypes.getSparkDataType("decimal(38,38)"), new DecimalType(38,38));
  assertEquals(ConfigurationDataTypes.getSparkDataType("decimal ( 38 , 38 ) "), new DecimalType(38,38));
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.STRING), DataTypes.StringType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.FLOAT), DataTypes.FloatType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.DOUBLE), DataTypes.DoubleType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.BYTE), DataTypes.ByteType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.SHORT), DataTypes.ShortType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.INT), DataTypes.IntegerType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.LONG), DataTypes.LongType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.BOOLEAN), DataTypes.BooleanType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.BINARY), DataTypes.BinaryType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.DATE), DataTypes.DateType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.TIMESTAMP), DataTypes.TimestampType);
}
 
Example 14
Source Project: systemds   Source File: MLContextTest.java    License: Apache License 2.0
@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example 15
Source Project: envelope   Source File: TestEventTimeUpsertPlanner.java    License: Apache License 2.0
@Before
public void before() { 
  arriving = Lists.newArrayList();
  existing = Lists.newArrayList();

  keySchema = DataTypes.createStructType(Lists.newArrayList(
    DataTypes.createStructField("key", DataTypes.StringType, false)));
  recordSchema = DataTypes.createStructType(Lists.newArrayList(
    DataTypes.createStructField("key", DataTypes.StringType, false),
    DataTypes.createStructField("value", DataTypes.StringType, true),
    DataTypes.createStructField("timestamp", DataTypes.LongType, true)));

  configMap = Maps.newHashMap();
  configMap.put(EventTimeUpsertPlanner.KEY_FIELD_NAMES_CONFIG_NAME, Lists.newArrayList("key"));
  configMap.put(EventTimeUpsertPlanner.VALUE_FIELD_NAMES_CONFIG_NAME, Lists.newArrayList("value"));
  configMap.put(EventTimeUpsertPlanner.TIMESTAMP_FIELD_NAMES_CONFIG_NAME, Lists.newArrayList("timestamp"));
  config = ConfigFactory.parseMap(configMap);
}
 
Example 16
Source Project: mmtf-spark   Source File: AtomInteraction.java    License: Apache License 2.0
/**
 * Returns the schema for a row of pairwise atom interactions.
 * The schema is used to create a {@code Dataset<Row>} from the row information.
 *
 * @return schema for the dataset
 */
public static StructType getPairInteractionSchema() {
	int length = InteractionCenter.getLength();
	StructField[] sf = new StructField[2 * length + 2];

	int index = 0;
	sf[index++] = DataTypes.createStructField("pdbId", DataTypes.StringType, false);

	// copy schema info for query atom
	System.arraycopy(InteractionCenter.getStructFields(0), 0, sf, index, length);
	index += length;

	// copy schema info for interacting atoms and their distance
	System.arraycopy(InteractionCenter.getStructFields(1), 0, sf, index, length);
	index += length;
	sf[index++] = DataTypes.createStructField("distance1", DataTypes.FloatType, true);

	return new StructType(sf);
}
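
For reference, the returned StructType can be passed straight to SparkSession.createDataFrame. A small sketch, assuming an existing SparkSession named spark and the AtomInteraction class above:

// Sketch only: an empty Dataset<Row> carrying the pairwise-interaction schema
StructType pairSchema = AtomInteraction.getPairInteractionSchema();
Dataset<Row> interactions = spark.createDataFrame(
    java.util.Collections.<Row>emptyList(), pairSchema);
interactions.printSchema();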
 
Example 17
Source Project: geowave   Source File: SimpleFeatureMapper.java    License: Apache License 2.0
@Override
public Row call(final SimpleFeature feature) throws Exception {
  final Object[] fields = new Serializable[schema.size()];

  for (int i = 0; i < schema.size(); i++) {
    final Object fieldObj = feature.getAttribute(i);
    if (fieldObj != null) {
      final StructField structField = schema.apply(i);
      if (structField.name().equals("geom")) {
        fields[i] = fieldObj;
      } else if (structField.dataType() == DataTypes.TimestampType) {
        fields[i] = new Timestamp(((Date) fieldObj).getTime());
      } else if (structField.dataType() != null) {
        fields[i] = fieldObj;
      } else {
        LOGGER.error("Unexpected attribute in field(" + structField.name() + "): " + fieldObj);
      }
    }
  }

  return new GenericRowWithSchema(fields, schema);
}
 
Example 18
Source Project: envelope   Source File: TestRowUtils.java    License: Apache License 2.0
@Test
public void testRemoveOneField() {
  StructField field1 = DataTypes.createStructField("field1", DataTypes.StringType, true);
  StructField field2 = DataTypes.createStructField("field2", DataTypes.IntegerType, true);
  StructField field3 = DataTypes.createStructField("field3", DataTypes.FloatType, true);
  StructType removeSchema = DataTypes.createStructType(Lists.newArrayList(field1, field2, field3));
  Row remove = new RowWithSchema(removeSchema, "hello", 1, 1.0);

  Row removed = RowUtils.remove(remove, "field2");

  Row expected = new RowWithSchema(
      DataTypes.createStructType(Lists.newArrayList(field1, field3)),
      "hello", 1.0);

  assertEquals(expected, removed);
}
 
Example 19
Source Project: deeplearning4j   Source File: DataFrames.java    License: Apache License 2.0
/**
 * Convert a DataVec schema to a
 * Spark SQL struct type.
 *
 * @param schema the DataVec schema to convert
 * @return the equivalent Spark StructType
 */
public static StructType fromSchema(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns()];
    for (int i = 0; i < structFields.length; i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i] = new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i] =
                                new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i] = new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i] = new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                                "This api should not be used with strings , binary data or ndarrays. This is only for columnar data");
        }
    }
    return new StructType(structFields);
}
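
A hedged usage sketch for the conversion above (assuming the DataVec Schema builder API and the DataFrames class from the source file):

// Sketch only: build a small DataVec schema, then convert it
org.datavec.api.transform.schema.Schema datavecSchema =
    new org.datavec.api.transform.schema.Schema.Builder()
        .addColumnDouble("price")
        .addColumnInteger("count")
        .build();
StructType sparkSchema = DataFrames.fromSchema(datavecSchema);
sparkSchema.printTreeString(); // double and integer map to DoubleType and IntegerType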
 
Example 20
Source Project: rdf2x   Source File: InstanceRelationWriter.java    License: Apache License 2.0
private DataType getDataType(int type) {
    switch (type) {
        case LiteralType.BOOLEAN:
            return DataTypes.BooleanType;
        case LiteralType.STRING:
            return DataTypes.StringType;
        case LiteralType.FLOAT:
            return DataTypes.FloatType;
        case LiteralType.DOUBLE:
            return DataTypes.DoubleType;
        case LiteralType.INTEGER:
            return DataTypes.IntegerType;
        case LiteralType.LONG:
            return DataTypes.LongType;
        case LiteralType.DATETIME:
            // datetime not supported due to timezone issues with java.sql.Timestamp
            // check the InstanceAggregator for more info
            return DataTypes.StringType;
    }
    throw new NotImplementedException("Not able to write literal type " + type);
}
 
Example 21
Source Project: envelope   Source File: TestProtobufTranslator.java    License: Apache License 2.0
@Test
public void translateMultiple() throws Exception {
  String descPath = TestProtobufTranslator.class.getResource(MULTIPLE_EXAMPLE).getPath();

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(ProtobufTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "protobuf");
  configMap.put(ProtobufTranslator.SCHEMA_CONFIG + "." + 
      ProtobufSchema.DESCRIPTOR_FILEPATH_CONFIG, descPath);
  configMap.put(ProtobufTranslator.SCHEMA_CONFIG + "." +
      ProtobufSchema.DESCRIPTOR_MESSAGE_CONFIG, "OtherExample");
  Config config = ConfigFactory.parseMap(configMap);

  ProtobufTranslator translator = new ProtobufTranslator();
  assertNoValidationFailures(translator, config);
  translator.configure(config);

  byte[] key = "foo".getBytes();
  byte[] payload = Files.readAllBytes(MULTIPLE_UNCOMPRESSED.toPath());

  Row raw = TestingMessageFactory.get(key, DataTypes.BinaryType, payload, DataTypes.BinaryType);
  Iterable<Row> results = translator.translate(raw);

  assertThat(results.iterator().hasNext(), is(true));
  Row row = results.iterator().next();
  assertThat(row.getString(0), is("other"));
}
 
Example 22
Source Project: bpmn.ai   Source File: TypeCastStep.java    License: BSD 3-Clause "New" or "Revised" License
private DataType mapDataType(List<StructField> datasetFields, String column, String typeConfig) {

    DataType currentDatatype = getCurrentDataType(datasetFields, column);

    // when typeConfig is null (no config for this column), return the current DataType
    if (typeConfig == null) {
        return currentDatatype;
    }

    switch (typeConfig) {
        case "integer":
            return DataTypes.IntegerType;
        case "long":
            return DataTypes.LongType;
        case "double":
            return DataTypes.DoubleType;
        case "boolean":
            return DataTypes.BooleanType;
        case "date":
            return DataTypes.DateType;
        case "timestamp":
            return DataTypes.TimestampType;
        default:
            return DataTypes.StringType;
    }
}
 
Example 23
Source Project: hudi   Source File: TestFlatteningTransformer.java    License: Apache License 2.0
@Test
public void testFlatten() {
  FlatteningTransformer transformer = new FlatteningTransformer();

  // Init
  StructField[] nestedStructFields =
      new StructField[] {new StructField("nestedIntColumn", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("nestedStringColumn", DataTypes.StringType, true, Metadata.empty()),};

  StructField[] structFields =
      new StructField[] {new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty()),
          new StructField("nestedStruct", DataTypes.createStructType(nestedStructFields), true, Metadata.empty())};

  StructType schema = new StructType(structFields);
  String flattenedSql = transformer.flattenSchema(schema, null);

  assertEquals("intColumn as intColumn,stringColumn as stringColumn,"
      + "nestedStruct.nestedIntColumn as nestedStruct_nestedIntColumn,"
      + "nestedStruct.nestedStringColumn as nestedStruct_nestedStringColumn", flattenedSql);
}
 
Example 24
Source Project: systemds   Source File: MLContextTest.java    License: Apache License 2.0
@Test
public void testDataFrameSumDMLDoublesWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, doubles with no ID column");

	List<String> list = new ArrayList<>();
	list.add("10,20,30");
	list.add("40,50,60");
	list.add("70,80,90");
	JavaRDD<String> javaRddString = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_DOUBLES);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 450.0");
	ml.execute(script);
}
 
Example 25
Source Project: systemds   Source File: MLContextFrameTest.java    License: Apache License 2.0
@Test
public void testTransform() {
	System.out.println("MLContextFrameTest - transform");
	
	Row[] rowsA = {RowFactory.create("\"`@(\"(!&",2,"20news-bydate-train/comp.os.ms-windows.misc/9979"),
			RowFactory.create("\"`@(\"\"(!&\"",3,"20news-bydate-train/comp.os.ms-windows.misc/9979")};

	JavaRDD<Row> javaRddRowA = sc.parallelize(Arrays.asList(rowsA));

	List<StructField> fieldsA = new ArrayList<>();
	fieldsA.add(DataTypes.createStructField("featureName", DataTypes.StringType, true));
	fieldsA.add(DataTypes.createStructField("featureValue", DataTypes.IntegerType, true));
	fieldsA.add(DataTypes.createStructField("id", DataTypes.StringType, true));
	StructType schemaA = DataTypes.createStructType(fieldsA);
	Dataset<Row> dataFrameA = spark.createDataFrame(javaRddRowA, schemaA);

	String dmlString = "[tA, tAM] = transformencode (target = A, spec = \"{ids: false ,recode: [ featureName, id ]}\");";

	Script script = dml(dmlString)
			.in("A", dataFrameA,
					new FrameMetadata(FrameFormat.CSV, dataFrameA.count(), (long) dataFrameA.columns().length))
			.out("tA").out("tAM");
	ml.setExplain(true);
	ml.setExplainLevel(ExplainLevel.RECOMPILE_HOPS);
	MLResults results = ml.execute(script);

	double[][] matrixtA = results.getMatrixAs2DDoubleArray("tA");
	Assert.assertEquals(1.0, matrixtA[0][2], 0.0);

	Dataset<Row> dataFrame_tA = results.getMatrix("tA").toDF();
	System.out.println("Number of matrix tA rows = " + dataFrame_tA.count());
	dataFrame_tA.printSchema();
	dataFrame_tA.show();
	
	Dataset<Row> dataFrame_tAM = results.getFrame("tAM").toDF();
	System.out.println("Number of frame tAM rows = " + dataFrame_tAM.count());
	dataFrame_tAM.printSchema();
	dataFrame_tAM.show();
}
 
Example 26
Source Project: sparkResearch   Source File: CustomDataFrame.java    License: Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    // Create a plain JavaRDD
    JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD();
    // The schema, encoded as a string
    String schema = "name age";

    // Build the schema from the schema string
    List<StructField> structFieldList = new ArrayList<>();
    for (String fieldName : schema.split(" ")) {
        StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        structFieldList.add(structField);
    }
    StructType structType = DataTypes.createStructType(structFieldList);

    JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String v1) {
            String[] attributes = v1.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType);

    // Create a temporary view
    dataset.createOrReplaceTempView("user");
    Dataset<Row> result = sparkSession.sql("select * from user");
    result.show();
}
 
Example 27
Source Project: envelope   Source File: TestRowUtils.java    License: Apache License 2.0
@Test
public void testToRowValueNull() {
  DataType field = DataTypes.NullType;

  assertEquals("Invalid NULL", null, RowUtils.toRowValue(null, field));

  thrown.expect(RuntimeException.class);
  thrown.expectMessage("Invalid or unrecognized input format");
  RowUtils.toRowValue(ByteBuffer.allocate(1), field);
}
 
Example 28
Source Project: SparkDemo   Source File: JavaChiSqSelectorExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaChiSqSelectorExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
    RowFactory.create(8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
    RowFactory.create(9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("features", new VectorUDT(), false, Metadata.empty()),
    new StructField("clicked", DataTypes.DoubleType, false, Metadata.empty())
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);

  ChiSqSelector selector = new ChiSqSelector()
    .setNumTopFeatures(1)
    .setFeaturesCol("features")
    .setLabelCol("clicked")
    .setOutputCol("selectedFeatures");

  Dataset<Row> result = selector.fit(df).transform(df);

  System.out.println("ChiSqSelector output with top " + selector.getNumTopFeatures()
      + " features selected");
  result.show();

  // $example off$
  spark.stop();
}
 
Example 29
Source Project: envelope   Source File: TestAvroUtils.java    License: Apache License 2.0
@Test
public void toDataTypeRecordOptional() {
  Schema record = SchemaBuilder.record("test").fields()
      .name("field1").type().optional().intType()
      .endRecord();

  assertEquals("Invalid DataType", DataTypes.createStructType(Lists.newArrayList(
      DataTypes.createStructField("field1", DataTypes.IntegerType, true)
  )), AvroUtils.dataTypeFor(record));
}
 
Example 30
Source Project: SparkDemo   Source File: JavaAFTSurvivalRegressionExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaAFTSurvivalRegressionExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(1.218, 1.0, Vectors.dense(1.560, -0.605)),
    RowFactory.create(2.949, 0.0, Vectors.dense(0.346, 2.158)),
    RowFactory.create(3.627, 0.0, Vectors.dense(1.380, 0.231)),
    RowFactory.create(0.273, 1.0, Vectors.dense(0.520, 1.151)),
    RowFactory.create(4.199, 0.0, Vectors.dense(0.795, -0.226))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("censor", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> training = spark.createDataFrame(data, schema);
  double[] quantileProbabilities = new double[]{0.3, 0.6};
  AFTSurvivalRegression aft = new AFTSurvivalRegression()
    .setQuantileProbabilities(quantileProbabilities)
    .setQuantilesCol("quantiles");

  AFTSurvivalRegressionModel model = aft.fit(training);

  // Print the coefficients, intercept and scale parameter for AFT survival regression
  System.out.println("Coefficients: " + model.coefficients());
  System.out.println("Intercept: " + model.intercept());
  System.out.println("Scale: " + model.scale());
  model.transform(training).show(false);
  // $example off$

  spark.stop();
}