org.apache.spark.sql.types.DataTypes Java Examples

The following examples show how to use org.apache.spark.sql.types.DataTypes. Each example is extracted from an open source project; the source file and its license are noted above the code.
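Before diving into the examples, note that DataTypes serves two roles: it exposes singleton instances for the primitive Spark SQL types (DataTypes.StringType, DataTypes.DoubleType, and so on) and factory methods such as createStructField, createStructType, and createArrayType for building composite schemas. The minimal sketch below is not taken from any of the projects that follow; the class and column names are purely illustrative:

import java.util.Arrays;
import java.util.List;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class DataTypesSketch {
    public static StructType exampleSchema() {
        // Singleton types cover the primitives; factory methods build composites.
        List<StructField> fields = Arrays.asList(
                DataTypes.createStructField("name", DataTypes.StringType, false),
                DataTypes.createStructField("score", DataTypes.DoubleType, true),
                // Array column: element type plus a containsNull flag.
                DataTypes.createStructField("tags",
                        DataTypes.createArrayType(DataTypes.StringType, true), true));
        return DataTypes.createStructType(fields);
    }
}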
Example #1
Source File: MLContextUtil.java    From systemds with Apache License 2.0
/**
 * Examine the DataFrame schema to determine whether the data appears to be
 * a matrix.
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if the DataFrame appears to be a matrix,
 *         {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
	StructType schema = df.schema();
	StructField[] fields = schema.fields();
	if (fields == null) {
		return true;
	}
	for (StructField field : fields) {
		DataType dataType = field.dataType();
		if ((dataType != DataTypes.DoubleType) && (dataType != DataTypes.IntegerType)
				&& (dataType != DataTypes.LongType) && (!(dataType instanceof org.apache.spark.ml.linalg.VectorUDT))
				&& (!(dataType instanceof org.apache.spark.mllib.linalg.VectorUDT))) {
			// uncomment if we support arrays of doubles for matrices
			// if (dataType instanceof ArrayType) {
			// ArrayType arrayType = (ArrayType) dataType;
			// if (arrayType.elementType() == DataTypes.DoubleType) {
			// continue;
			// }
			// }
			return false;
		}
	}
	return true;
}
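A hedged usage sketch of the check above, assuming an existing SparkSession named spark and following the DataFrame-construction pattern used in the tests later on this page:

// Illustrative only; "spark" is assumed to be a live SparkSession.
StructType schema = DataTypes.createStructType(Arrays.asList(
        DataTypes.createStructField("C1", DataTypes.DoubleType, true),
        DataTypes.createStructField("C2", DataTypes.DoubleType, true)));
Dataset<Row> df = spark.createDataFrame(
        Arrays.asList(RowFactory.create(1.0, 2.0), RowFactory.create(3.0, 4.0)), schema);
boolean isMatrixLike = MLContextUtil.doesDataFrameLookLikeMatrix(df); // true: all columns are DoubleType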
 
Example #2
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLMllibVectorWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, mllib vector with no ID column");

	List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<>();
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #3
Source File: DataFrames.java    From deeplearning4j with Apache License 2.0
/**
 * Convert a DataVec schema to a Spark SQL struct type.
 *
 * @param schema the DataVec schema to convert
 * @return the equivalent Spark StructType
 */
public static StructType fromSchema(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns()];
    for (int i = 0; i < structFields.length; i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i] = new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i] =
                                new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i] = new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i] = new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                                "This API should not be used with strings, binary data, or ndarrays. It is only for columnar data");
        }
    }
    return new StructType(structFields);
}
 
Example #4
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testGetTuple1DML() {
	System.out.println("MLContextTest - Get Tuple1<Matrix> DML");
	JavaRDD<String> javaRddString = sc
			.parallelize(Stream.of("1,2,3", "4,5,6", "7,8,9").collect(Collectors.toList()));
	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> df = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("N=M*2").in("M", df).out("N");
	Tuple1<Matrix> tuple = ml.execute(script).getTuple("N");
	double[][] n = tuple._1().to2DDoubleArray();
	Assert.assertEquals(2.0, n[0][0], 0);
	Assert.assertEquals(4.0, n[0][1], 0);
	Assert.assertEquals(6.0, n[0][2], 0);
	Assert.assertEquals(8.0, n[1][0], 0);
	Assert.assertEquals(10.0, n[1][1], 0);
	Assert.assertEquals(12.0, n[1][2], 0);
	Assert.assertEquals(14.0, n[2][0], 0);
	Assert.assertEquals(16.0, n[2][1], 0);
	Assert.assertEquals(18.0, n[2][2], 0);
}
 
Example #5
Source File: NGlobalDictionaryV2Test.java    From kylin-on-parquet-v2 with Apache License 2.0
private void runWithSparkBuildGlobalDict(NGlobalDictionaryV2 dict, List<String> stringSet) throws IOException {
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    dict.prepareWrite();
    List<Row> rowList = Lists.newLinkedList();
    for (String str : stringSet) {
        rowList.add(RowFactory.create(str));
    }
    Dataset<Row> ds = ss.createDataFrame(rowList,
            new StructType(new StructField[] { DataTypes.createStructField("col1", DataTypes.StringType, true) }));
    ds.toJavaRDD().mapToPair((PairFunction<Row, String, String>) row -> {
        if (row.get(0) == null)
            return new Tuple2<>(null, null);
        return new Tuple2<>(row.get(0).toString(), null);
    }).sortByKey().partitionBy(new HashPartitioner(BUCKET_SIZE)).mapPartitionsWithIndex(
            (Function2<Integer, Iterator<Tuple2<String, String>>, Iterator<Object>>) (bucketId, tuple2Iterator) -> {
                NBucketDictionary bucketDict = dict.loadBucketDictionary(bucketId);
                while (tuple2Iterator.hasNext()) {
                    Tuple2<String, String> tuple2 = tuple2Iterator.next();
                    bucketDict.addRelativeValue(tuple2._1);
                }
                bucketDict.saveBucketDict(bucketId);
                return Lists.newArrayList().iterator();
            }, true).count();

    dict.writeMetaDict(BUCKET_SIZE, config.getGlobalDictV2MaxVersions(), config.getGlobalDictV2VersionTTL());
}
 
Example #6
Source File: InstanceRelationWriter.java    From rdf2x with Apache License 2.0
private DataType getDataType(int type) {
    switch (type) {
        case LiteralType.BOOLEAN:
            return DataTypes.BooleanType;
        case LiteralType.STRING:
            return DataTypes.StringType;
        case LiteralType.FLOAT:
            return DataTypes.FloatType;
        case LiteralType.DOUBLE:
            return DataTypes.DoubleType;
        case LiteralType.INTEGER:
            return DataTypes.IntegerType;
        case LiteralType.LONG:
            return DataTypes.LongType;
        case LiteralType.DATETIME:
            // datetime not supported due to timezone issues with java.sql.Timestamp
            // check the InstanceAggregator for more info
            return DataTypes.StringType;
    }
    throw new NotImplementedException("Not able to write literal type " + type);
}
 
Example #7
Source File: TestRowUtils.java    From envelope with Apache License 2.0
@Test
public void testRemoveOneField() {
  StructField field1 = DataTypes.createStructField("field1", DataTypes.StringType, true);
  StructField field2 = DataTypes.createStructField("field2", DataTypes.IntegerType, true);
  StructField field3 = DataTypes.createStructField("field3", DataTypes.FloatType, true);
  StructType removeSchema = DataTypes.createStructType(Lists.newArrayList(field1, field2, field3));
  Row remove = new RowWithSchema(removeSchema, "hello", 1, 1.0);

  Row removed = RowUtils.remove(remove, "field2");

  Row expected = new RowWithSchema(
      DataTypes.createStructType(Lists.newArrayList(field1, field3)),
      "hello", 1.0);

  assertEquals(expected, removed);
}
 
Example #8
Source File: TestSuite.java    From stocator with Apache License 2.0
public void test16(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
    throws Exception {
  System.out.println("*********************************");
  System.out.println("T16: Non overwrite mode " + containerOut);
  String o1 = containerOut + "myData/123";
  StructType schema = DataTypes
      .createStructType(new StructField[] { DataTypes.createStructField("NAME", DataTypes.StringType, false),
          DataTypes.createStructField("STRING_VALUE", DataTypes.StringType, false),
          DataTypes.createStructField("NUM_VALUE", DataTypes.IntegerType, false), });
  Row r1 = RowFactory.create("name1", "value1", 1);
  Row r2 = RowFactory.create("name2", "value2", 2);
  List<Row> rowList = ImmutableList.of(r1, r2);
  Dataset<Row> rows = spark.createDataFrame(rowList, schema);
  try {
    if (type.equals(Constants.PARQUET_TYPE)) {
      rows.write().mode(SaveMode.Overwrite).parquet(o1);
    } else if (type.equals(Constants.JSON_TYPE)) {
      rows.write().mode(SaveMode.Overwrite).json(o1);
    }
  } catch (Exception e) {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
    throw e;
  } finally {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
  }
}
 
Example #9
Source File: SimpleFeatureMapper.java    From geowave with Apache License 2.0
@Override
public Row call(final SimpleFeature feature) throws Exception {
  final Object[] fields = new Serializable[schema.size()];

  for (int i = 0; i < schema.size(); i++) {
    final Object fieldObj = feature.getAttribute(i);
    if (fieldObj != null) {
      final StructField structField = schema.apply(i);
      if (structField.name().equals("geom")) {
        fields[i] = fieldObj;
      } else if (structField.dataType() == DataTypes.TimestampType) {
        fields[i] = new Timestamp(((Date) fieldObj).getTime());
      } else if (structField.dataType() != null) {
        fields[i] = fieldObj;
      } else {
        LOGGER.error("Unexpected attribute in field(" + structField.name() + "): " + fieldObj);
      }
    }
  }

  return new GenericRowWithSchema(fields, schema);
}
 
Example #10
Source File: TestMorphlineTranslator.java    From envelope with Apache License 2.0
@Test (expected = MorphlineCompilationException.class)
public void invalidCommand() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(MorphlineTranslator.ENCODING_KEY, "UTF-8");
  configMap.put(MorphlineTranslator.ENCODING_MSG, "UTF-8");
  configMap.put(MorphlineTranslator.MORPHLINE, getResourcePath(MORPHLINE_FILE));
  configMap.put(MorphlineTranslator.MORPHLINE_ID, "invalid-command");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "flat");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
      Lists.newArrayList("int", "str", "float"));
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
      Lists.newArrayList("int", "string", "float"));
  Config config = ConfigFactory.parseMap(configMap);

  translator.configure(config);
  Row raw = TestingMessageFactory.get("The Key", DataTypes.StringType,
      "The Message", DataTypes.StringType);
  translator.translate(raw);
}
 
Example #11
Source File: SchemaConverter.java    From geowave with Apache License 2.0
public static StructType schemaFromFeatureType(final SimpleFeatureType featureType) {
  final List<StructField> fields = new ArrayList<>();

  for (final AttributeDescriptor attrDesc : featureType.getAttributeDescriptors()) {
    final SimpleFeatureDataType sfDataType = attrDescToDataType(attrDesc);

    final String fieldName = (sfDataType.isGeom() ? "geom" : attrDesc.getName().getLocalPart());

    final StructField field =
        DataTypes.createStructField(fieldName, sfDataType.getDataType(), true);

    fields.add(field);
  }

  if (fields.isEmpty()) {
    LOGGER.error("Feature type produced empty dataframe schema!");
    return null;
  }

  return DataTypes.createStructType(fields);
}
 
Example #12
Source File: TestEventTimeUpsertPlanner.java    From envelope with Apache License 2.0
@Before
public void before() { 
  arriving = Lists.newArrayList();
  existing = Lists.newArrayList();

  keySchema = DataTypes.createStructType(Lists.newArrayList(
    DataTypes.createStructField("key", DataTypes.StringType, false)));
  recordSchema = DataTypes.createStructType(Lists.newArrayList(
    DataTypes.createStructField("key", DataTypes.StringType, false),
    DataTypes.createStructField("value", DataTypes.StringType, true),
    DataTypes.createStructField("timestamp", DataTypes.LongType, true)));

  configMap = Maps.newHashMap();
  configMap.put(EventTimeUpsertPlanner.KEY_FIELD_NAMES_CONFIG_NAME, Lists.newArrayList("key"));
  configMap.put(EventTimeUpsertPlanner.VALUE_FIELD_NAMES_CONFIG_NAME, Lists.newArrayList("value"));
  configMap.put(EventTimeUpsertPlanner.TIMESTAMP_FIELD_NAMES_CONFIG_NAME, Lists.newArrayList("timestamp"));
  config = ConfigFactory.parseMap(configMap);
}
 
Example #13
Source File: TestAvroUtils.java    From envelope with Apache License 2.0
@Test
public void toTypeSchemaStructTypeFieldNullable() throws Exception {
  Schema schema = AvroUtils.typeFor(DataTypes.createStructType(
      Lists.newArrayList(
          DataTypes.createStructField("field1", DataTypes.StringType, true)
      )),
      false);

  assertEquals("Invalid type", Schema.Type.RECORD, schema.getType());
  assertEquals("Invalid record name", "record0", schema.getName());
  assertEquals("Invalid field count", 1, schema.getFields().size());
  assertEquals("Invalid field name", "field1", schema.getFields().get(0).name());
  assertEquals("Invalid field type", Schema.Type.UNION, schema.getFields().get(0).schema().getType());

  for (Schema s : schema.getFields().get(0).schema().getTypes()) {
    assertThat("Invalid union types", s.getType(), anyOf(is(Schema.Type.STRING), is(Schema.Type.NULL)));
  }

  //System.out.println(schema.toString(true));
}
 
Example #14
Source File: TestFlatteningTransformer.java    From hudi with Apache License 2.0
@Test
public void testFlatten() {
  FlatteningTransformer transformer = new FlatteningTransformer();

  // Init
  StructField[] nestedStructFields =
      new StructField[] {new StructField("nestedIntColumn", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("nestedStringColumn", DataTypes.StringType, true, Metadata.empty()),};

  StructField[] structFields =
      new StructField[] {new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty()),
          new StructField("nestedStruct", DataTypes.createStructType(nestedStructFields), true, Metadata.empty())};

  StructType schema = new StructType(structFields);
  String flattenedSql = transformer.flattenSchema(schema, null);

  assertEquals("intColumn as intColumn,stringColumn as stringColumn,"
      + "nestedStruct.nestedIntColumn as nestedStruct_nestedIntColumn,"
      + "nestedStruct.nestedStringColumn as nestedStruct_nestedStringColumn", flattenedSql);
}
 
Example #15
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #16
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLDoublesWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, doubles with no ID column");

	List<String> list = new ArrayList<>();
	list.add("10,20,30");
	list.add("40,50,60");
	list.add("70,80,90");
	JavaRDD<String> javaRddString = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_DOUBLES);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 450.0");
	ml.execute(script);
}
 
Example #17
Source File: AtomInteraction.java    From mmtf-spark with Apache License 2.0
/**
 * Returns the schema for a row of pairwise atom interactions.
 * The schema is used to create a Dataset&lt;Row&gt; from the row information.
 *
 * @return schema for the dataset
 */
public static StructType getPairInteractionSchema() {
	int length = InteractionCenter.getLength();
	StructField[] sf = new StructField[2 * length + 2];

	int index = 0;
	sf[index++] = DataTypes.createStructField("pdbId", DataTypes.StringType, false);

	// copy schema info for query atom
	System.arraycopy(InteractionCenter.getStructFields(0), 0, sf, index, length);
	index += length;

	// copy schema info for interacting atoms and their distance
	System.arraycopy(InteractionCenter.getStructFields(1), 0, sf, index, length);
	index += length;
	sf[index++] = DataTypes.createStructField("distance1", DataTypes.FloatType, true);

	return new StructType(sf);
}
 
Example #18
Source File: TestProtobufTranslator.java    From envelope with Apache License 2.0
@Test
public void translateMultiple() throws Exception {
  String descPath = TestProtobufTranslator.class.getResource(MULTIPLE_EXAMPLE).getPath();

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(ProtobufTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "protobuf");
  configMap.put(ProtobufTranslator.SCHEMA_CONFIG + "." + 
      ProtobufSchema.DESCRIPTOR_FILEPATH_CONFIG, descPath);
  configMap.put(ProtobufTranslator.SCHEMA_CONFIG + "." +
      ProtobufSchema.DESCRIPTOR_MESSAGE_CONFIG, "OtherExample");
  Config config = ConfigFactory.parseMap(configMap);

  ProtobufTranslator translator = new ProtobufTranslator();
  assertNoValidationFailures(translator, config);
  translator.configure(config);

  byte[] key = "foo".getBytes();
  byte[] payload = Files.readAllBytes(MULTIPLE_UNCOMPRESSED.toPath());

  Row raw = TestingMessageFactory.get(key, DataTypes.BinaryType, payload, DataTypes.BinaryType);
  Iterable<Row> results = translator.translate(raw);

  assertThat(results.iterator().hasNext(), is(true));
  Row row = results.iterator().next();
  assertThat(row.getString(0), is("other"));
}
 
Example #19
Source File: TypeCastStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private DataType mapDataType(List<StructField> datasetFields, String column, String typeConfig) {

    DataType currentDatatype = getCurrentDataType(datasetFields, column);

    // when typeConfig is null (no config for this column), return the current DataType
    if (typeConfig == null) {
        return currentDatatype;
    }

    switch (typeConfig) {
        case "integer":
            return DataTypes.IntegerType;
        case "long":
            return DataTypes.LongType;
        case "double":
            return DataTypes.DoubleType;
        case "boolean":
            return DataTypes.BooleanType;
        case "date":
            return DataTypes.DateType;
        case "timestamp":
            return DataTypes.TimestampType;
        default:
            return DataTypes.StringType;
    }
}
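The DataType resolved by mapDataType would typically be applied by casting the column. A short sketch, assuming a Dataset&lt;Row&gt; named dataset is in scope (the column name is illustrative; this is not code from the original class):

// Sketch: apply a resolved target type via Column.cast.
DataType target = DataTypes.IntegerType; // e.g. the result of mapDataType(...)
Dataset<Row> casted = dataset.withColumn("durationInMillis",
        dataset.col("durationInMillis").cast(target));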
 
Example #20
Source File: TestRowUtils.java    From envelope with Apache License 2.0
@Test
public void testToRowValueDate() {
  DataType field = DataTypes.DateType;

  DateTime dateObj = DateTime.parse("2017-01-01T00:00:00"); // Pass-thru the TZ
  Date sqlDate = new Date(dateObj.getMillis());

  assertEquals("Invalid Long", sqlDate, RowUtils.toRowValue(dateObj.getMillis(), field));
  assertEquals("Invalid String", sqlDate, RowUtils.toRowValue("2017-001", field)); // ISO Date format
  assertEquals("Invalid Date", sqlDate, RowUtils.toRowValue(dateObj.toDate(), field));
  assertEquals("Invalid DateTime", sqlDate, RowUtils.toRowValue(dateObj, field));

  thrown.expect(RuntimeException.class);
  thrown.expectMessage(CoreMatchers.containsString("Invalid or unrecognized input format"));
  RowUtils.toRowValue(123, field);
}
 
Example #21
Source File: TestConfigurationDataTypes.java    From envelope with Apache License 2.0
@Test
public void testGetSparkDataTypeValid() {
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.DECIMAL), new DecimalType());
  assertEquals(ConfigurationDataTypes.getSparkDataType("decimal(38,38)"), new DecimalType(38,38));
  assertEquals(ConfigurationDataTypes.getSparkDataType("decimal ( 38 , 38 ) "), new DecimalType(38,38));
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.STRING), DataTypes.StringType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.FLOAT), DataTypes.FloatType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.DOUBLE), DataTypes.DoubleType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.BYTE), DataTypes.ByteType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.SHORT), DataTypes.ShortType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.INT), DataTypes.IntegerType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.LONG), DataTypes.LongType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.BOOLEAN), DataTypes.BooleanType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.BINARY), DataTypes.BinaryType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.DATE), DataTypes.DateType);
  assertEquals(ConfigurationDataTypes.getSparkDataType(ConfigurationDataTypes.TIMESTAMP), DataTypes.TimestampType);
}
 
Example #22
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with ID column");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #23
Source File: TestDelimitedSerializer.java    From envelope with Apache License 2.0
@Test
public void testDelimitedSerialization() {
  List<StructField> fields = Lists.newArrayList(
      DataTypes.createStructField("field1", DataTypes.StringType, true),
      DataTypes.createStructField("field2", DataTypes.IntegerType, true),
      DataTypes.createStructField("field3", DataTypes.BooleanType, true)
  );
  Row row = new RowWithSchema(DataTypes.createStructType(fields), "hello", 1, false);
  
  Map<String, String> configs = Maps.newHashMap();
  configs.put(DelimitedSerializer.FIELD_DELIMITER_CONFIG_NAME, "||");
  Serializer<Row> serializer = new DelimitedSerializer();
  serializer.configure(configs, false);
  
  byte[] serialized = serializer.serialize("test", row);
  serializer.close();
  
  assertEquals(new String(serialized), "hello||1||false");
}
 
Example #24
Source File: TestMorphlineTranslator.java    From envelope with Apache License 2.0
@Test (expected = MorphlineRuntimeException.class)
public void noRecordReturned() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(MorphlineTranslator.ENCODING_KEY, "UTF-8");
  configMap.put(MorphlineTranslator.ENCODING_MSG, "UTF-8");
  configMap.put(MorphlineTranslator.MORPHLINE, getResourcePath(MORPHLINE_FILE));
  configMap.put(MorphlineTranslator.MORPHLINE_ID, "no-return");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "flat");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
      Lists.newArrayList("int", "str", "float"));
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
      Lists.newArrayList("int", "string", "float"));
  Config config = ConfigFactory.parseMap(configMap);

  translator.configure(config);
  Row raw = TestingMessageFactory.get("The Key", DataTypes.StringType, 
      "The Message", DataTypes.StringType);
  translator.translate(raw);
}
 
Example #25
Source File: TestAvroUtils.java    From envelope with Apache License 2.0
@Test
public void toTypeSchemaStringNotNullable() throws Exception {
  Schema schema = AvroUtils.typeFor(DataTypes.StringType, false);

  assertEquals("Invalid type", Schema.Type.STRING, schema.getType());

  //System.out.println(schema.toString(true));
}
 
Example #26
Source File: TestDelimitedTranslator.java    From envelope with Apache License 2.0
@Test
public void testNullMissing() {
  String delimited = "val1 2 34";
  
  Config config = ConfigFactory.empty()
      .withValue(DelimitedTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME,
          ConfigValueFactory.fromAnyRef("flat"))
      .withValue(DelimitedTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
          ConfigValueFactory.fromIterable(
              Lists.newArrayList("field1", "field2", "field3", "field4", "field5")))
      .withValue(DelimitedTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
          ConfigValueFactory.fromIterable(
              Lists.newArrayList("string", "int", "long", "int", "boolean")))
      .withValue(DelimitedTranslator.DELIMITER_CONFIG_NAME, ConfigValueFactory.fromAnyRef(" "));

  DelimitedTranslator t = new DelimitedTranslator();
  assertNoValidationFailures(t, config);
  t.configure(config);
  Row raw = TestingMessageFactory.get("testkey", DataTypes.StringType, delimited, DataTypes.StringType);
  Row r = t.translate(raw).iterator().next();
  assertEquals(r.length(), 5);
  assertEquals(r.get(0), "val1");
  assertEquals(r.get(1), 2);
  assertEquals(r.get(2), 34L);
  assertEquals(r.get(3), null);
  assertEquals(r.get(4), null);
}
 
Example #27
Source File: TestAvroUtils.java    From envelope with Apache License 2.0
@Test
public void toTypeSchemaStringNullable() throws Exception {
  Schema schema = AvroUtils.typeFor(DataTypes.StringType);

  assertEquals("Invalid type", Schema.Type.UNION, schema.getType());
  assertEquals("Invalid union size", 2, schema.getTypes().size());

  for (Schema s : schema.getTypes()) {
    assertThat("Invalid union types", s.getType(), anyOf(is(Schema.Type.STRING), is(Schema.Type.NULL)));
  }

  //System.out.println(schema.toString(true));
}
 
Example #28
Source File: TestRowUtils.java    From envelope with Apache License 2.0
@Test
public void testToRowValueTimestamp() {
  DataType field = DataTypes.TimestampType;

  DateTime dateObj = DateTime.parse("2017-01-01T00:00:00"); // Pass-thru the TZ
  Timestamp sqlTimestamp = new Timestamp(dateObj.getMillis());

  assertEquals("Invalid Long", sqlTimestamp, RowUtils.toRowValue(dateObj.getMillis(), field));
  assertEquals("Invalid String", sqlTimestamp, RowUtils.toRowValue("2017-001", field)); // ISO Date format
  assertEquals("Invalid Date", sqlTimestamp, RowUtils.toRowValue(dateObj.toDate(), field));
  assertEquals("Invalid DateTime", sqlTimestamp, RowUtils.toRowValue(dateObj, field));

  // Test custom timestamp format parsing
  Map<RowUtils.RowValueMetadata, Object> metadataNull = Maps.newHashMap();
  Map<RowUtils.RowValueMetadata, Object> metadataEmpty = Maps.newHashMap();
  Map<RowUtils.RowValueMetadata, Object> metadataFormat = Maps.newHashMap();
  Set<String> empty = Sets.newHashSet();
  Set<String> formats = Sets.newHashSet();
  formats.add("yyyy-MM-dd HH:mm:ss.SSSSS");
  metadataNull.put(RowUtils.RowValueMetadata.TIMESTAMP_FORMATS, null);
  metadataEmpty.put(RowUtils.RowValueMetadata.TIMESTAMP_FORMATS, empty);
  metadataFormat.put(RowUtils.RowValueMetadata.TIMESTAMP_FORMATS, formats);
  assertEquals("Invalid null metadata", sqlTimestamp, RowUtils.toRowValue("2017-01-01T00:00:00", field, null));
  assertEquals("Invalid null format set in metadata", sqlTimestamp, RowUtils.toRowValue("2017-01-01T00:00:00", field, metadataNull));
  assertEquals("Invalid format set", sqlTimestamp, RowUtils.toRowValue("2017-01-01 00:00:00.00000", field, metadataFormat));
  assertEquals("Invalid empty format set", sqlTimestamp, RowUtils.toRowValue("2017-01-01T00:00:00", field, metadataEmpty));

  thrown.expect(RuntimeException.class);
  thrown.expectMessage(CoreMatchers.containsString("Invalid or unrecognized input format"));
  RowUtils.toRowValue(123, field);
}
 
Example #29
Source File: TestInListDeriver.java    From envelope with Apache License 2.0
private static StructType createTestSchema() {
  return DataTypes.createStructType(Arrays.asList(
      DataTypes.createStructField("id", DataTypes.StringType, true),
      DataTypes.createStructField("descr", DataTypes.StringType, true),
      DataTypes.createStructField("value", DataTypes.IntegerType, true),
      DataTypes.createStructField("vdate", DataTypes.DateType, true))
  );
}
 
Example #30
Source File: TestSparkSchema.java    From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaIsHonored() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
          }
      );

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation);

  Row[] results = (Row[]) resultDf.collect();

  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}