org.apache.spark.sql.types.DataType Java Examples

The following examples show how to use org.apache.spark.sql.types.DataType. They are extracted from open source projects; the source file and project license are noted above each example, and each snippet can be traced back to its original project for full context.
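
As a quick orientation before the examples (a minimal sketch, assuming the usual org.apache.spark.sql.types imports and an existing Dataset<Row> named df), DataType values are typically obtained from the DataTypes factory class or read back from an existing schema:

DataType stringType = DataTypes.StringType;                                               // primitive type singleton
DataType mapType = DataTypes.createMapType(DataTypes.StringType, DataTypes.IntegerType);  // complex type
DataType firstColumnType = df.schema().fields()[0].dataType();                            // type of an existing column
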
Example #1
Source File: MLContextUtil.java    From systemds with Apache License 2.0
/**
 * Examine the DataFrame schema to determine whether the data appears to be
 * a matrix.
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if the DataFrame appears to be a matrix,
 *         {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
	StructType schema = df.schema();
	StructField[] fields = schema.fields();
	if (fields == null) {
		return true;
	}
	for (StructField field : fields) {
		DataType dataType = field.dataType();
		if ((dataType != DataTypes.DoubleType) && (dataType != DataTypes.IntegerType)
				&& (dataType != DataTypes.LongType) && (!(dataType instanceof org.apache.spark.ml.linalg.VectorUDT))
				&& (!(dataType instanceof org.apache.spark.mllib.linalg.VectorUDT))) {
			// uncomment if we support arrays of doubles for matrices
			// if (dataType instanceof ArrayType) {
			// ArrayType arrayType = (ArrayType) dataType;
			// if (arrayType.elementType() == DataTypes.DoubleType) {
			// continue;
			// }
			// }
			return false;
		}
	}
	return true;
}
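
A minimal usage sketch for the check above, assuming an existing SparkSession named spark (the column names and values are made up for illustration):

StructType schema = DataTypes.createStructType(Arrays.asList(
    DataTypes.createStructField("c1", DataTypes.DoubleType, false),
    DataTypes.createStructField("c2", DataTypes.DoubleType, false)));
Dataset<Row> df = spark.createDataFrame(
    Arrays.asList(RowFactory.create(1.0, 2.0), RowFactory.create(3.0, 4.0)), schema);
boolean looksLikeMatrix = MLContextUtil.doesDataFrameLookLikeMatrix(df);   // true: only double columns
boolean withStrings = MLContextUtil.doesDataFrameLookLikeMatrix(
    df.withColumn("label", functions.lit("a")));                           // false: adds a StringType column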
 
Example #2
Source File: TestRowUtils.java    From envelope with Apache License 2.0
@Test
public void testToRowValueMapNested() {
  DataType field = DataTypes.createMapType(DataTypes.StringType,
      DataTypes.createMapType(DataTypes.LongType, DataTypes.IntegerType, true)
  );

  Map<Object, Object> expectedInnerMap = Maps.newHashMap();
  expectedInnerMap.put(9L, 1);
  expectedInnerMap.put(8L, 2);

  Map<Object, Object> expectedOuterMap = Maps.newHashMap();
  expectedOuterMap.put("outer", expectedInnerMap);

  Map<Object, Object> innerMap = Maps.newHashMap();
  innerMap.put(9L, 1);
  innerMap.put(8L, 2);

  Map<Object, Object> outerMap = Maps.newHashMap();
  outerMap.put("outer", innerMap);

  assertEquals("Invalid list of values", expectedOuterMap, RowUtils.toRowValue(outerMap, field));
}
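
The nested type built above composes like any other Spark SQL type; printing it (a side illustration, not part of the test) should give something like the following:

DataType nested = DataTypes.createMapType(DataTypes.StringType,
    DataTypes.createMapType(DataTypes.LongType, DataTypes.IntegerType, true));
System.out.println(nested.simpleString());   // map<string,map<bigint,int>>
System.out.println(nested.json());           // full JSON form, including the valueContainsNull flags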
 
Example #3
Source File: TestRowUtils.java    From envelope with Apache License 2.0
@Test
public void testToRowValueBoolean() {
  DataType field = DataTypes.BooleanType;

  assertEquals("Invalid Boolean", true, RowUtils.toRowValue(true, field));
  assertEquals("Invalid 'true'", true, RowUtils.toRowValue("true", field));
  assertEquals("Invalid 'true'", true, RowUtils.toRowValue("TrUe", field));
  assertEquals("Invalid 'false'", false, RowUtils.toRowValue("false", field));
  assertEquals("Invalid 'false'", false, RowUtils.toRowValue("FaLsE", field));

  try {
    RowUtils.toRowValue(123, field);
    fail("Expected a RuntimeException for invalid type");
  } catch (RuntimeException e) {
    assertThat(e.getMessage(), CoreMatchers.containsString("Invalid or unrecognized input format"));
  }
}
 
Example #4
Source File: LongStreamV2.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public void nextShortVector(DataType type, int items, ColumnVector columnVector, boolean[] isNull)
        throws IOException
{
    for (int i = 0, j = 0; i < items; i++) {
        while (columnVector.isNullAt(i+j)) {
            columnVector.appendNull();
            j++;
        }
        if (isNull[i]) {
            columnVector.appendNull();
        }
        else {
            columnVector.appendShort((short)next());
        }
    }
}
 
Example #5
Source File: Reader.java    From iceberg with Apache License 2.0
PartitionRowConverter(Schema partitionSchema, PartitionSpec spec) {
  StructType partitionType = SparkSchemaUtil.convert(partitionSchema);
  StructField[] fields = partitionType.fields();

  this.types = new DataType[fields.length];
  this.positions = new int[types.length];
  this.javaTypes = new Class<?>[types.length];
  this.reusedRow = new GenericInternalRow(types.length);

  List<PartitionField> partitionFields = spec.fields();
  for (int rowIndex = 0; rowIndex < fields.length; rowIndex += 1) {
    this.types[rowIndex] = fields[rowIndex].dataType();

    int sourceId = partitionSchema.columns().get(rowIndex).fieldId();
    for (int specIndex = 0; specIndex < partitionFields.size(); specIndex += 1) {
      PartitionField field = spec.fields().get(specIndex);
      if (field.sourceId() == sourceId && "identity".equals(field.transform().toString())) {
        positions[rowIndex] = specIndex;
        javaTypes[rowIndex] = spec.javaClasses()[specIndex];
        break;
      }
    }
  }
}
 
Example #6
Source File: BooleanStream.java    From spliceengine with GNU Affero General Public License v3.0
/**
 * Sets the vector element to true if the bit is set, skipping the null values.
 */
public void getSetBits(DataType type, int batchSize, ColumnVector columnVector, boolean[] isNull)
        throws IOException {
    for (int i = 0, j = 0; i < batchSize; i++) {
        while (columnVector.isNullAt(i+j)) {
            columnVector.appendNull();
            j++;
        }
        if (isNull[i]) {
            columnVector.appendNull();
        }
        else {
            // read more data if necessary
            if (bitsInData == 0) {
                readByte();
            }
            // read bit
            columnVector.appendBoolean((data & HIGH_BIT_MASK) != 0);
            // mark bit consumed
            data <<= 1;
            bitsInData--;
        }
    }
}
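
As a standalone illustration of the bit consumption above (not part of BooleanStream; the mask value 0x80 is assumed here), one freshly read byte decodes highest bit first:

int data = 0b10110000;            // one freshly read byte
final int HIGH_BIT_MASK = 0x80;   // assumed: selects the current highest bit
boolean[] decoded = new boolean[8];
for (int bitsInData = 8, k = 0; bitsInData > 0; bitsInData--, k++) {
    decoded[k] = (data & HIGH_BIT_MASK) != 0;   // read bit
    data = (data << 1) & 0xFF;                  // mark bit consumed
}
// decoded => {true, false, true, true, false, false, false, false}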
 
Example #7
Source File: FileSystemInput.java    From envelope with Apache License 2.0
private DataType getKeyDataType() {
  if (Arrays.asList(expectedSchema.fieldNames()).contains("key")) {
    DataType keyDataType = expectedSchema.fields()[expectedSchema.fieldIndex("key")].dataType();

    if (convertToClass(keyDataType) == null) {
      throw new RuntimeException("Translator for filesystem input's input format is not compatible"
          + " because it does not use a supported type for the 'key' field");
    }

    return keyDataType;
  }
  else {
    // If the translator doesn't want the key field we don't specify a type so as to signal
    // that we want the key to be discarded. This is important for the 'input-format' format
    // because it is not known at runtime what the data type of an input format key should be.
    // We don't do the same for the value field because that is mandatory and so the translator
    // would typically be specifying which data type to use.
    return null;
  }
}
 
Example #8
Source File: LocalWithSparkSessionTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private static DataType convertType(org.apache.kylin.metadata.datatype.DataType type) {
    if (type.isTimeFamily())
        return DataTypes.TimestampType;

    if (type.isDateTimeFamily())
        return DataTypes.DateType;

    if (type.isIntegerFamily())
        return DataTypes.LongType;

    if (type.isNumberFamily())
        return DataTypes.createDecimalType(19, 4);

    if (type.isStringFamily())
        return DataTypes.StringType;

    if (type.isBoolean())
        return DataTypes.BooleanType;

    throw new IllegalArgumentException("KAP data type: " + type + " can not be converted to spark's type.");
}
 
Example #9
Source File: BooleanStream.java    From spliceengine with GNU Affero General Public License v3.0
/**
 * Sets the vector element to true if the bit is set.
 */
public void getSetBits(DataType type, int batchSize, ColumnVector columnVector)
        throws IOException {
    for (int i = 0, j = 0; i < batchSize; i++) {
        while (columnVector.isNullAt(i+j)) {
            columnVector.appendNull();
            j++;
        }
        // read more data if necessary
        if (bitsInData == 0) {
            readByte();
        }

        // read bit
        columnVector.appendBoolean((data & HIGH_BIT_MASK) != 0);

        // mark bit consumed
        data <<= 1;
        bitsInData--;
    }
}
 
Example #10
Source File: InstanceRelationWriter.java    From rdf2x with Apache License 2.0
private DataType getDataType(int type) {
    switch (type) {
        case LiteralType.BOOLEAN:
            return DataTypes.BooleanType;
        case LiteralType.STRING:
            return DataTypes.StringType;
        case LiteralType.FLOAT:
            return DataTypes.FloatType;
        case LiteralType.DOUBLE:
            return DataTypes.DoubleType;
        case LiteralType.INTEGER:
            return DataTypes.IntegerType;
        case LiteralType.LONG:
            return DataTypes.LongType;
        case LiteralType.DATETIME:
            // datetime not supported due to timezone issues with java.sql.Timestamp
            // check the InstanceAggregator for more info
            return DataTypes.StringType;
    }
    throw new NotImplementedException("Not able to write literal type " + type);
}
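
A hypothetical caller of the mapping above, building a nullable column for an integer literal property (the column name is made up for illustration):

DataType sparkType = getDataType(LiteralType.INTEGER);                       // DataTypes.IntegerType
StructField ageColumn = DataTypes.createStructField("age", sparkType, true);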
 
Example #11
Source File: ProtobufUtils.java    From envelope with Apache License 2.0
/**
 * Construct a {@code Dataset} schema from a {@code Descriptor}
 * <p>
 * This iterates and recurses through a {@link com.google.protobuf.Descriptors.Descriptor} and produces a
 * {@link StructType} for {@link org.apache.spark.sql.Dataset<Row>}.
 * Protobuf {@code oneof} fields are flattened into discrete {@link StructField} instances.
 * <p>
 * This will pass the value of {@link Descriptors.FieldDescriptor#isRequired()} to the associated {@link StructField}.
 *
 * @param descriptor the Descriptor to convert
 * @return the converted StructType
 */
public static StructType buildSchema(Descriptors.Descriptor descriptor) {
  List<StructField> members = new ArrayList<>();
  List<Descriptors.FieldDescriptor> protoFields = descriptor.getFields();

  for (Descriptors.FieldDescriptor fieldDescriptor : protoFields) {
    DataType fieldType = convertType(fieldDescriptor);
    StructField structField = DataTypes.createStructField(fieldDescriptor.getName(), fieldType,
        !fieldDescriptor.isRequired());
    members.add(structField);
    LOG.debug("FieldDescriptor[{}] => StructField[{}] ", fieldDescriptor.getFullName(), structField);
  }

  if (members.isEmpty()) {
    throw new RuntimeException("No FieldDescriptors found");
  }

  return DataTypes.createStructType(members.toArray(new StructField[0]));
}
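
A usage sketch, assuming AddressBook is a message class generated by protoc (so a static getDescriptor() is available on it):

StructType schema = ProtobufUtils.buildSchema(AddressBook.getDescriptor());
schema.printTreeString();   // one StructField per protobuf field, with oneof members flattened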
 
Example #12
Source File: AvroWithSparkSchemaVisitor.java    From iceberg with Apache License 2.0
private static <T> T visitArray(DataType type, Schema array, AvroWithSparkSchemaVisitor<T> visitor) {
  if (array.getLogicalType() instanceof LogicalMap || type instanceof MapType) {
    Preconditions.checkState(
        AvroSchemaUtil.isKeyValueSchema(array.getElementType()),
        "Cannot visit invalid logical map type: %s", array);
    Preconditions.checkArgument(type instanceof MapType, "Invalid map: %s is not a map", type);
    MapType map = (MapType) type;
    List<Schema.Field> keyValueFields = array.getElementType().getFields();
    return visitor.map(map, array,
        visit(map.keyType(), keyValueFields.get(0).schema(), visitor),
        visit(map.valueType(), keyValueFields.get(1).schema(), visitor));

  } else {
    Preconditions.checkArgument(type instanceof ArrayType, "Invalid array: %s is not an array", type);
    ArrayType list = (ArrayType) type;
    return visitor.array(list, array, visit(list.elementType(), array.getElementType(), visitor));
  }
}
 
Example #13
Source File: LongStreamV1.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public void nextShortVector(DataType type, int items, ColumnVector columnVector, boolean[] isNull)
        throws IOException {
    for (int i = 0, j = 0; i < items; i++) {
        while (columnVector.isNullAt(i+j)) {
            columnVector.appendNull();
            j++;
        }
        if (isNull[i]) {
            columnVector.appendNull();
        }
        else {
            columnVector.appendShort((short)next());
        }
    }
}
 
Example #14
Source File: LongStreamV1.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public void nextLongVector(DataType type, int items, ColumnVector columnVector, boolean[] isNull)
        throws IOException
{
    for (int i = 0, j = 0; i < items; i++) {
        while (columnVector.isNullAt(i+j)) {
            columnVector.appendNull();
            j++;
        }
        if (isNull[i]) {
            columnVector.appendNull();
        }
        else {
            columnVector.appendLong(next());
        }
    }
}
 
Example #15
Source File: KVPTranslator.java    From envelope with Apache License 2.0
@Override
public Validations getValidations() {
  return Validations.builder()
      .mandatoryPath(KVP_DELIMITER_CONFIG_NAME, ConfigValueType.STRING)
      .mandatoryPath(FIELD_DELIMITER_CONFIG_NAME, ConfigValueType.STRING)
      .mandatoryPath(SCHEMA_CONFIG, ConfigValueType.OBJECT)
      .add(new SupportedFieldTypesValidation(SCHEMA_CONFIG, 
          new HashSet<DataType>(Arrays.asList(new DecimalType(),    DataTypes.StringType,
                                              DataTypes.FloatType,  DataTypes.DoubleType,
                                              DataTypes.ShortType,  DataTypes.IntegerType,
                                              DataTypes.LongType,   DataTypes.BooleanType,
                                              DataTypes.BinaryType, DataTypes.DateType,
                                              DataTypes.TimestampType))))
      .optionalPath(TIMESTAMP_FORMAT_CONFIG_NAME, ConfigValueType.LIST)
      .handlesOwnValidationPath(SCHEMA_CONFIG)
      .build();
}
 
Example #16
Source File: TestMorphlineUtils.java    From envelope with Apache License 2.0
@Test
public void convertToRowMissingColumnNotNullable(
    final @Mocked RowUtils utils
) throws Exception {

  Record record = new Record();
  record.put("foo", "one");

  StructType schema = DataTypes.createStructType(Lists.newArrayList(
      DataTypes.createStructField("field1", DataTypes.StringType, false))
  );

  try {
    MorphlineUtils.convertToRow(schema, record);
    fail("Did not throw a RuntimeException");
  } catch (Exception e) {
    assertThat(e.getMessage(), CoreMatchers.containsString("DataType cannot contain 'null'"));
  }

  new Verifications() {{
    RowUtils.toRowValue(any, (DataType) any); times = 0;
  }};
}
 
Example #17
Source File: SparkArithmeticOperator.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public Column getColumnExpression(Dataset<Row> leftDF,
                                  Dataset<Row> rightDF,
                                  Function<String, DataType> convertStringToDataTypeFunction) throws UnsupportedOperationException {
    Column leftExpr  = getLeftChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);
    Column rightExpr = getRightChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);

    if (opKind == PLUS)
        return leftExpr.plus(rightExpr);
    else if (opKind == MINUS)
        return leftExpr.minus(rightExpr);
    else if (opKind == TIMES)
        return leftExpr.multiply(rightExpr);
    else if (opKind == DIVIDE)
        return leftExpr.divide(rightExpr);
    else
        throw new UnsupportedOperationException();
}
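
The Column returned above composes like ordinary Spark SQL column arithmetic; for example, the PLUS branch behaves like the following sketch (assuming an existing SparkSession named spark):

Dataset<Row> df = spark.range(3).toDF("x");
Column sum = df.col("x").plus(df.col("x"));   // what opKind == PLUS produces for x + x
df.select(sum.alias("x_plus_x")).show();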
 
Example #18
Source File: FileSystemInput.java    From envelope with Apache License 2.0
private DataType getValueDataType() {
  DataType valueDataType;

  if (Arrays.asList(expectedSchema.fieldNames()).contains(Translator.VALUE_FIELD_NAME)) {
    valueDataType = expectedSchema.fields()[expectedSchema.fieldIndex(Translator.VALUE_FIELD_NAME)].dataType();
  }
  else {
    // In the rare situation that the translator does not expect a specific data type for the
    // value column then we are in a tricky situation because there is no way to inspect the
    // input format class to see what data types it will return. Instead, we compromise and assume
    // it is text, since it seems reasonable that a schemaless translator would probably be used
    // with the input-format format so that raw lines of text from the file could be retrieved.
    valueDataType = DataTypes.StringType;
  }

  if (convertToClass(valueDataType) == null) {
    throw new RuntimeException("Translator for filesystem input's input format is not compatible"
        + " because it does not use a supported type for the '" + Translator.VALUE_FIELD_NAME + "' field");
  }

  return valueDataType;
}
 
Example #19
Source File: DelimitedTranslator.java    From envelope with Apache License 2.0
@Override
public Iterable<Row> translate(Row message) {
  String value = message.getAs(Translator.VALUE_FIELD_NAME);

  String[] stringValues = value.split((delimiterRegex) ?
                          delimiter : Pattern.quote(delimiter), schema.length());
  values.clear();

  for (int valuePos = 0; valuePos < schema.length(); valuePos++) {
    Object rowVal = null; 
    if (valuePos < stringValues.length) {
      String fieldValue = stringValues[valuePos];
      DataType fieldType = schema.fields()[valuePos].dataType();

      if (fieldValue.length() > 0) {
        rowVal = RowUtils.toRowValue(fieldValue, fieldType, rowValueMetadata);
      }
    }
    values.add(rowVal); 
  }

  Row row = new RowWithSchema(schema, values.toArray());

  return Collections.singleton(row);
}
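
A side note on the Pattern.quote call above: when delimiterRegex is false the delimiter is treated as a literal string, which matters for regex metacharacters such as "|":

String[] parts = "a|b|c".split(Pattern.quote("|"), 3);   // => ["a", "b", "c"]
// Without quoting, "|" would be interpreted as an (empty) regex alternation and split at every character.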
 
Example #20
Source File: LongStreamV1.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public void nextIntVector(DataType type, int items, ColumnVector columnVector, boolean[] isNull)
        throws IOException {
    for (int i = 0, j = 0; i < items; i++) {
        while (columnVector.isNullAt(i+j)) {
            columnVector.appendNull();
            j++;
        }
        if (isNull[i]) {
            columnVector.appendNull();
        }
        else {
            columnVector.appendInt((int)next());
        }
    }
}
 
Example #21
Source File: TestMorphlineUtils.java    From envelope with Apache License 2.0
@Test
public void convertToRowValidNullValue(
    final @Mocked RowUtils utils
) throws Exception {

  Record record = new Record();
  record.put("field1", null);

  StructType schema = DataTypes.createStructType(Lists.newArrayList(
      DataTypes.createStructField("field1", DataTypes.StringType, true))
  );

  assertEquals("Invalid conversion", null, MorphlineUtils.convertToRow(schema, record).get(0));

  new Verifications() {{
    RowUtils.toRowValue(any, (DataType) any); times = 0;
  }};
}
 
Example #22
Source File: TypeCastStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private DataType getCurrentDataType(List<StructField> datasetFields, String column) {

    // search current datatype
    for (StructField sf : datasetFields) {
        if (sf.name().equals(column)) {
            return sf.dataType();
        }
    }

    return null;
}
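
A hypothetical caller of the lookup above, deciding whether a cast is needed (the dataset variable and column name are assumed for illustration):

List<StructField> fields = Arrays.asList(dataset.schema().fields());
DataType current = getCurrentDataType(fields, "duration");
if (current != null && !current.equals(DataTypes.LongType)) {
    dataset = dataset.withColumn("duration", dataset.col("duration").cast(DataTypes.LongType));
}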
 
Example #23
Source File: TestRowUtils.java    From envelope with Apache License 2.0
@Test
public void testToRowValueNull() {
  DataType field = DataTypes.NullType;

  assertEquals("Invalid NULL", null, RowUtils.toRowValue(null, field));

  thrown.expect(RuntimeException.class);
  thrown.expectMessage("Invalid or unrecognized input format");
  RowUtils.toRowValue(ByteBuffer.allocate(1), field);
}
 
Example #24
Source File: DefinitionToSparkVisitor.java    From bunsen with Apache License 2.0
@Override
public HapiConverter<DataType> visitComposite(String elementName,
    String elementPath, String baseType,
    String elementTypeUrl, List<StructureField<HapiConverter<DataType>>> children) {

  String recordName = DefinitionVisitorsUtil.recordNameFor(elementPath);
  String recordNamespace = DefinitionVisitorsUtil.namespaceFor(basePackage, elementTypeUrl);
  String fullName = recordNamespace + "." + recordName;

  HapiConverter<DataType> converter = visitedConverters.get(fullName);

  if (converter == null) {
    StructField[] fields = children.stream()
        .map(entry -> new StructField(entry.fieldName(),
            entry.result().getDataType(),
            true,
            Metadata.empty()))
        .toArray(StructField[]::new);

    converter = new HapiCompositeToSparkConverter(baseType,
        children, new StructType(fields), fhirSupport);

    visitedConverters.put(fullName, converter);
  }

  return converter;

}
 
Example #25
Source File: Reader.java    From iceberg with Apache License 2.0
StructLikeInternalRow(StructType struct) {
  this.types = new DataType[struct.size()];
  StructField[] fields = struct.fields();
  for (int i = 0; i < fields.length; i += 1) {
    types[i] = fields[i].dataType();
  }
}
 
Example #26
Source File: LongStreamDwrf.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public void nextIntVector(DataType type, int items, ColumnVector columnVector)
        throws IOException
{
    for (int i = 0; i < items; i++) {
        columnVector.appendInt((int)next());
    }
}
 
Example #27
Source File: TestHelpers.java    From iceberg with Apache License 2.0
private static void assertEquals(String context, DataType type, Object expected, Object actual) {
  if (expected == null && actual == null) {
    return;
  }

  if (type instanceof StructType) {
    Assert.assertTrue("Expected should be an InternalRow: " + context,
        expected instanceof InternalRow);
    Assert.assertTrue("Actual should be an InternalRow: " + context,
        actual instanceof InternalRow);
    assertEquals(context, (StructType) type, (InternalRow) expected, (InternalRow) actual);

  } else if (type instanceof ArrayType) {
    Assert.assertTrue("Expected should be an ArrayData: " + context,
        expected instanceof ArrayData);
    Assert.assertTrue("Actual should be an ArrayData: " + context,
        actual instanceof ArrayData);
    assertEquals(context, (ArrayType) type, (ArrayData) expected, (ArrayData) actual);

  } else if (type instanceof MapType) {
    Assert.assertTrue("Expected should be a MapData: " + context,
        expected instanceof MapData);
    Assert.assertTrue("Actual should be a MapData: " + context,
        actual instanceof MapData);
    assertEquals(context, (MapType) type, (MapData) expected, (MapData) actual);

  } else if (type instanceof BinaryType) {
    assertEqualBytes(context, (byte[]) expected, (byte[]) actual);
  } else {
    Assert.assertEquals("Value should match expected: " + context, expected, actual);
  }
}
 
Example #28
Source File: DoubleStream.java    From spliceengine with GNU Affero General Public License v3.0
public void nextVector(DataType type, int items, ColumnVector columnVector)
        throws IOException
{
    for (int i = 0, j = 0; i < items; i++) {
        while (columnVector.isNullAt(i+j)) {
            columnVector.appendNull();
            j++;
        }
        columnVector.appendDouble(next());
    }
}
 
Example #29
Source File: DefinitionToSparkVisitor.java    From bunsen with Apache License 2.0
HapiCompositeToSparkConverter(String elementType,
    List<StructureField<HapiConverter<DataType>>> children,
    StructType structType,
    FhirConversionSupport fhirSupport,
    String extensionUrl) {

  super(elementType, children, structType, fhirSupport, extensionUrl);
}