Java Code Examples for org.apache.spark.sql.types.StructField#dataType()

The following examples show how to use org.apache.spark.sql.types.StructField#dataType(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: MLContextUtil.java    From systemds with Apache License 2.0 7 votes vote down vote up
/**
 * Decide, from the schema alone, whether a DataFrame plausibly holds matrix
 * data.
 * <p>
 * Every column has to be a double, integer, long or (ml / mllib) vector
 * column. A schema whose field array is {@code null} is treated as a matrix.
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if the DataFrame appears to be a matrix,
 *         {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
	StructField[] columns = df.schema().fields();
	if (columns == null) {
		return true;
	}
	for (StructField column : columns) {
		DataType type = column.dataType();
		boolean numericScalar = (type == DataTypes.DoubleType) || (type == DataTypes.IntegerType)
				|| (type == DataTypes.LongType);
		boolean vector = (type instanceof org.apache.spark.ml.linalg.VectorUDT)
				|| (type instanceof org.apache.spark.mllib.linalg.VectorUDT);
		if (numericScalar || vector) {
			continue;
		}
		// uncomment if we support arrays of doubles for matrices
		// if (type instanceof ArrayType) {
		// ArrayType arrayType = (ArrayType) type;
		// if (arrayType.elementType() == DataTypes.DoubleType) {
		// continue;
		// }
		// }
		return false;
	}
	return true;
}
 
Example 2
Source File: DBClientWrapper.java    From spark-data-sources with MIT License 6 votes vote down vote up
/**
 * Copies the string, double and long columns of a Spark row into a new
 * database row, naming each field after its schema column.
 * Columns of any other type are currently skipped.
 *
 * @param row  Spark row whose values are read by column position
 * @param type schema describing the columns of {@code row}
 * @return the newly built database row
 */
public static edb.common.Row sparkToDBRow(org.apache.spark.sql.Row row, StructType type) {
    edb.common.Row result = new edb.common.Row();
    StructField[] columns = type.fields();
    int columnCount = type.size();
    for (int pos = 0; pos < columnCount; pos++) {
        StructField column = columns[pos];
        if (column.dataType() == DataTypes.StringType) {
            result.addField(new edb.common.Row.StringField(column.name(), row.getString(pos)));
            continue;
        }
        if (column.dataType() == DataTypes.DoubleType) {
            result.addField(new edb.common.Row.DoubleField(column.name(), row.getDouble(pos)));
            continue;
        }
        if (column.dataType() == DataTypes.LongType) {
            result.addField(new edb.common.Row.Int64Field(column.name(), row.getLong(pos)));
            continue;
        }
        // TODO: type leakage
    }

    return result;
}
 
Example 3
Source File: SchemaConverter.java    From geowave with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the attribute descriptor that corresponds to a Spark struct field.
 *
 * <p>A field named "geom" always yields a non-nillable geometry attribute.
 * For any other field the Java binding is chosen from the Spark data type;
 * a field whose type has no mapping here produces {@code null}.
 *
 * @param attrBuilder builder used to create the descriptor
 * @param field Spark field to convert
 * @return the descriptor, or {@code null} when the field type is not mapped
 */
private static AttributeDescriptor attrDescFromStructField(
    final AttributeTypeBuilder attrBuilder,
    final StructField field) {
  final String fieldName = field.name();
  if (fieldName.equals("geom")) {
    return attrBuilder.binding(Geometry.class).nillable(false).buildDescriptor("geom");
  }

  // Pick the Java class that the attribute will be bound to.
  final Class<?> binding;
  if (field.dataType() == DataTypes.StringType) {
    binding = String.class;
  } else if (field.dataType() == DataTypes.DoubleType) {
    binding = Double.class;
  } else if (field.dataType() == DataTypes.FloatType) {
    binding = Float.class;
  } else if (field.dataType() == DataTypes.LongType) {
    binding = Long.class;
  } else if (field.dataType() == DataTypes.IntegerType) {
    binding = Integer.class;
  } else if (field.dataType() == DataTypes.BooleanType) {
    binding = Boolean.class;
  } else if (field.dataType() == DataTypes.TimestampType) {
    binding = Date.class;
  } else {
    return null;
  }

  return attrBuilder.binding(binding).buildDescriptor(fieldName);
}
 
Example 4
Source File: SimpleFeatureMapper.java    From geowave with Apache License 2.0 6 votes vote down vote up
/**
 * Converts a feature into a Spark row laid out according to {@code schema}.
 *
 * <p>Attributes are read by position. Null attributes leave their slot
 * empty, timestamp-typed fields are converted from {@code Date} to
 * {@code Timestamp}, and all other values (including the "geom" field) are
 * copied unchanged.
 *
 * @param feature the feature to convert
 * @return a row carrying the converted values together with the schema
 */
@Override
public Row call(final SimpleFeature feature) throws Exception {
  final int fieldCount = schema.size();
  final Object[] values = new Serializable[fieldCount];

  for (int idx = 0; idx < fieldCount; idx++) {
    final Object attribute = feature.getAttribute(idx);
    if (attribute == null) {
      // Nothing to copy; the slot stays null.
      continue;
    }

    final StructField structField = schema.apply(idx);
    if (structField.name().equals("geom")) {
      values[idx] = attribute;
    } else if (structField.dataType() == DataTypes.TimestampType) {
      final long millis = ((Date) attribute).getTime();
      values[idx] = new Timestamp(millis);
    } else if (structField.dataType() != null) {
      values[idx] = attribute;
    } else {
      LOGGER.error("Unexpected attribute in field(" + structField.name() + "): " + attribute);
    }
  }

  return new GenericRowWithSchema(values, schema);
}
 
Example 5
Source File: MLContextUtil.java    From systemds with Apache License 2.0 6 votes vote down vote up
/**
 * Inspect the schema of a DataFrame and report whether its columns are all
 * of a kind that can make up a matrix: double, integer, long, or an ml /
 * mllib vector. A {@code null} field array counts as a matrix.
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if the DataFrame appears to be a matrix,
 *         {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
	StructType dfSchema = df.schema();
	StructField[] schemaFields = dfSchema.fields();
	if (schemaFields != null) {
		for (int i = 0; i < schemaFields.length; i++) {
			DataType colType = schemaFields[i].dataType();
			boolean matrixCompatible = (colType == DataTypes.DoubleType)
					|| (colType == DataTypes.IntegerType)
					|| (colType == DataTypes.LongType)
					|| (colType instanceof org.apache.spark.ml.linalg.VectorUDT)
					|| (colType instanceof org.apache.spark.mllib.linalg.VectorUDT);
			if (!matrixCompatible) {
				// uncomment if we support arrays of doubles for matrices
				// if (colType instanceof ArrayType) {
				// ArrayType arrayType = (ArrayType) colType;
				// if (arrayType.elementType() == DataTypes.DoubleType) {
				// continue;
				// }
				// }
				return false;
			}
		}
	}
	return true;
}
 
Example 6
Source File: IndexRUtil.java    From indexr with Apache License 2.0 6 votes vote down vote up
/**
 * Translates a list of Spark fields into an IndexR segment schema.
 *
 * <p>Supported mappings: integer to INT, long to BIGINT, float to FLOAT,
 * double to DOUBLE, string to VARCHAR, date to DATE and timestamp to DATETIME.
 *
 * @param sparkSchema Spark fields, in column order
 * @param isIndexed   consulted per column name to decide the indexed flag
 * @return the segment schema with one column per Spark field
 * @throws IllegalStateException if a field has any other data type
 */
public static SegmentSchema sparkSchemaToIndexRSchema(List<StructField> sparkSchema, IsIndexed isIndexed) {
    List<ColumnSchema> result = new ArrayList<>();
    for (StructField field : sparkSchema) {
        SQLType mapped = null;
        if (field.dataType() instanceof IntegerType) {
            mapped = SQLType.INT;
        } else if (field.dataType() instanceof LongType) {
            mapped = SQLType.BIGINT;
        } else if (field.dataType() instanceof FloatType) {
            mapped = SQLType.FLOAT;
        } else if (field.dataType() instanceof DoubleType) {
            mapped = SQLType.DOUBLE;
        } else if (field.dataType() instanceof StringType) {
            mapped = SQLType.VARCHAR;
        } else if (field.dataType() instanceof DateType) {
            mapped = SQLType.DATE;
        } else if (field.dataType() instanceof TimestampType) {
            mapped = SQLType.DATETIME;
        }
        if (mapped == null) {
            throw new IllegalStateException("Unsupported type: " + field.dataType());
        }
        String columnName = field.name();
        result.add(new ColumnSchema(columnName, mapped, isIndexed.apply(columnName)));
    }
    return new SegmentSchema(result);
}
 
Example 7
Source File: SchemaConverterTest.java    From bunsen with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the type of a nested field.
 *
 * <p>The path given by {@code names} is followed one level at a time. At each
 * level an array type is unwrapped to its element type, and the field is
 * looked up by case-insensitive name. The nullability assertion is applied
 * only to the last field on the path.
 *
 * @param dataType the struct type (or array of structs) to start from
 * @param isNullable the nullability expected of the final field
 * @param names the field names forming the path, outermost first
 * @return the data type of the field at the end of the path
 * @throws java.util.NoSuchElementException if a name on the path matches no
 *     field of the struct at that level
 */
DataType getField(DataType dataType, boolean isNullable, String... names) {

  StructType schema = dataType instanceof ArrayType
      ? (StructType) ((ArrayType) dataType).elementType()
      : (StructType) dataType;

  // Fail with the missing name and the available ones rather than the bare
  // "No value present" that Optional.get() would produce.
  StructField field = Arrays.stream(schema.fields())
      .filter(sf -> sf.name().equalsIgnoreCase(names[0]))
      .findFirst()
      .orElseThrow(() -> new java.util.NoSuchElementException(
          "No field named '" + names[0] + "' among "
              + Arrays.toString(schema.fieldNames())));

  DataType child = field.dataType();

  if (names.length == 1) {

    // Last name on the path: check the nullability.
    Assert.assertEquals("Unexpected nullability of field " + field.name(),
        isNullable,
        field.nullable());

    return child;
  } else {
    // Recurse through children since there are more names.
    return getField(child, isNullable, Arrays.copyOfRange(names, 1, names.length));
  }
}
 
Example 8
Source File: DbPersistorSQLServer.java    From rdf2x with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the DataFrame after casting every boolean column to an integer
 * column of the same name, then delegates to the superclass.
 *
 * @param name name passed through to the superclass writer
 * @param df   the DataFrame to write
 */
@Override
public void writeDataFrame(String name, DataFrame df) {
    DataFrame converted = df;
    for (StructField field : df.schema().fields()) {
        if (field.dataType() != DataTypes.BooleanType) {
            continue;
        }
        // convert booleans to integers to avoid error in Spark 1.6.2
        // "Cannot specify a column width on data type bit."
        String column = field.name();
        String tmpColumn = column + TMP_SUFFIX;
        converted = converted
                .withColumn(tmpColumn, converted.col(column).cast(DataTypes.IntegerType))
                .drop(column)
                .withColumnRenamed(tmpColumn, column);
    }
    super.writeDataFrame(name, converted);
}
 
Example 9
Source File: SparkMLEncoder.java    From jpmml-sparkml with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Creates the data field for the schema column with the given name.
 *
 * <p>String and boolean columns become categorical fields; integral and
 * double columns become continuous fields.
 *
 * @param name the field name, also used to look up the schema column
 * @return the created data field
 * @throws IllegalArgumentException if the column has any other data type
 */
public DataField createDataField(FieldName name){
	org.apache.spark.sql.types.DataType sparkDataType = getSchema().apply(name.getValue()).dataType();

	final OpType opType;
	final DataType dataType;

	if(sparkDataType instanceof StringType){
		opType = OpType.CATEGORICAL;
		dataType = DataType.STRING;
	} else if(sparkDataType instanceof IntegralType){
		opType = OpType.CONTINUOUS;
		dataType = DataType.INTEGER;
	} else if(sparkDataType instanceof DoubleType){
		opType = OpType.CONTINUOUS;
		dataType = DataType.DOUBLE;
	} else if(sparkDataType instanceof BooleanType){
		opType = OpType.CATEGORICAL;
		dataType = DataType.BOOLEAN;
	} else {
		throw new IllegalArgumentException("Expected string, integral, double or boolean data type, got " + sparkDataType.typeName() + " data type");
	}

	return createDataField(name, opType, dataType);
}
 
Example 10
Source File: TypeCastStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Looks up the data type of the first field whose name equals {@code column}.
 *
 * @param datasetFields fields to search, in order
 * @param column        name of the column to find
 * @return the data type of the matching field, or {@code null} if none matches
 */
private DataType getCurrentDataType(List<StructField> datasetFields, String column) {

        DataType found = null;

        // stop at the first field carrying the requested name
        for(StructField candidate : datasetFields) {
            if(!candidate.name().equals(column)) {
                continue;
            }
            found = candidate.dataType();
            break;
        }

        return found;
    }
 
Example 11
Source File: SupportedFieldTypesValidation.java    From envelope with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that every field of the configured schema has a supported type.
 *
 * <p>A field passes when its data type is contained in {@code validationTypes},
 * or when it is a decimal type and {@code validationTypes} contains a
 * default-constructed {@code DecimalType}.
 *
 * @param config configuration holding the schema configuration at {@code path}
 * @return an invalid result naming the first unsupported field type,
 *     otherwise a valid result
 */
@Override
public ValidationResult validate(Config config) {

  StructField[] schemaFields = ComponentFactory.create(
      Schema.class, config.getConfig(this.path), true).getSchema().fields();

  for (StructField field : schemaFields) {
    boolean decimalMatch = (field.dataType() instanceof DecimalType &&
                            validationTypes.contains(new DecimalType()));
    boolean exactMatch = validationTypes.contains(field.dataType());

    if (exactMatch || decimalMatch) {
      continue;
    }

    String message =
        "Schema field type " + field.dataType().simpleString() + " is not supported by this component type.";
    return new ValidationResult(this, Validity.INVALID, message);
  }

  return new ValidationResult(this, Validity.VALID, "Schema field types are valid for this component type.");
}
 
Example 12
Source File: TestHelpers.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that two internal rows hold equal values for every field of the
 * given struct type, after checking that the actual row has as many fields
 * as the struct. Field-level comparison is delegated to the type-aware
 * {@code assertEquals} overload.
 */
private static void assertEquals(String context, StructType struct,
                         InternalRow expected, InternalRow actual) {
  Assert.assertEquals("Should have correct number of fields", struct.size(), actual.numFields());
  StructField[] fields = struct.fields();
  for (int pos = 0; pos < actual.numFields(); pos++) {
    DataType fieldType = fields[pos].dataType();
    String fieldContext = context + "." + fields[pos].name();
    assertEquals(fieldContext, fieldType, expected.get(pos, fieldType), actual.get(pos, fieldType));
  }
}
 
Example 13
Source File: FrameRDDConverterUtils.java    From systemds with Apache License 2.0 5 votes vote down vote up
/**
 * Locate the vector column in a DataFrame schema.
 * 
 * @param dfschema schema as StructType
 * @param containsID if true, contains ID column
 * @return 0-based column index of vector column, -1 if no vector.
 */
private static int getColVectFromDFSchema(StructType dfschema, boolean containsID) {
	// with an ID column, the search starts one column later and the
	// returned index is shifted back by the same amount
	final int firstCol = containsID ? 1 : 0;
	final int numCols = dfschema.fields().length;
	for( int col=firstCol; col<numCols; col++ ) {
		StructField field = dfschema.apply(col);
		if( field.dataType() instanceof VectorUDT ) {
			return col-firstCol;
		}
	}
	
	return -1;
}
 
Example 14
Source File: FrameRDDConverterUtils.java    From systemds with Apache License 2.0 5 votes vote down vote up
/**
 * Find the first vector-typed column of a DataFrame schema.
 * 
 * @param dfschema schema as StructType
 * @param containsID if true, contains ID column
 * @return 0-based column index of vector column, -1 if no vector.
 */
private static int getColVectFromDFSchema(StructType dfschema, boolean containsID) {
	int skipped = containsID ? 1 : 0;
	int result = -1;
	int pos = skipped;
	while( result < 0 && pos < dfschema.fields().length ) {
		// report the position relative to the first non-ID column
		if( dfschema.apply(pos).dataType() instanceof VectorUDT ) {
			result = pos - skipped;
		}
		pos++;
	}
	
	return result;
}
 
Example 15
Source File: DBClientWrapper.java    From spark-data-sources with MIT License 5 votes vote down vote up
/**
 * Builds a database schema from the string, double and long columns of a
 * Spark struct type. Columns of any other type are currently skipped.
 *
 * @param st the Spark schema to convert
 * @return the database schema with one column per supported Spark field
 */
public static Schema sparkToDbSchema(StructType st) {
    Schema result = new Schema();
    for (StructField column : st.fields()) {
        Schema.ColumnType columnType;
        if (column.dataType() == DataTypes.StringType) {
            columnType = Schema.ColumnType.STRING;
        } else if (column.dataType() == DataTypes.DoubleType) {
            columnType = Schema.ColumnType.DOUBLE;
        } else if (column.dataType() == DataTypes.LongType) {
            columnType = Schema.ColumnType.INT64;
        } else {
            // TODO: type leakage
            continue;
        }
        result.addColumn(column.name(), columnType);
    }
    return result;
}
 
Example 16
Source File: InputTranslatorCompatibilityValidation.java    From envelope with Apache License 2.0 4 votes vote down vote up
/**
 * Checks that the configured input and translator can work together.
 *
 * <p>If either component cannot be instantiated the check is skipped and
 * reported as valid. A translator that uses a provided schema requires an
 * input that declares one. When the input declares its schema, every field
 * the translator expects must be provided with the same name and data type.
 *
 * @param config the step configuration holding the input and translator
 * @return the validation result with an explanatory message
 */
@Override
public ValidationResult validate(Config config) {
  Input input;
  Translator translator;
  try {
    input = ComponentFactory.create(Input.class, config.getConfig(DataStep.INPUT_TYPE), false);
    translator = ComponentFactory.create(
        Translator.class, config.getConfig(StreamingStep.TRANSLATOR_PROPERTY), false);
  }
  catch (Exception e) {
    String skipped = "Could not instantiate input and/or translator, so will not check if they" +
        " are compatible.";
    return new ValidationResult(this, Validity.VALID, skipped);
  }

  String inputClass = input.getClass().getSimpleName();
  String translatorClass = translator.getClass().getSimpleName();

  if (!(input instanceof DeclaresProvidingSchema)) {
    if (translator instanceof UsesProvidedSchema) {
      String undeclared = inputClass + " is not compatible with " + translatorClass +
          " because " + translatorClass + " requires " + inputClass + " to declare the schema that" +
          " it provides, but " + inputClass + " does not do so.";
      return new ValidationResult(this, Validity.INVALID, undeclared);
    }
    return new ValidationResult(this, Validity.VALID, "Input and translator are compatible");
  }

  for (StructField expected : translator.getExpectingSchema().fields()) {
    boolean provided = false;
    for (StructField candidate : ((DeclaresProvidingSchema) input).getProvidingSchema().fields()) {
      boolean sameName = expected.name().equals(candidate.name());
      if (sameName && expected.dataType().equals(candidate.dataType())) {
        provided = true;
      }
    }

    if (provided) {
      continue;
    }

    String missing = inputClass + " is not compatible with " + translatorClass + " because " +
        inputClass + " does not provide expected " + "field '" +
        expected.name() + "' with data type '" +
        expected.dataType() + "'";
    return new ValidationResult(this, Validity.INVALID, missing);
  }

  return new ValidationResult(this, Validity.VALID, "Input and translator are compatible");
}
 
Example 17
Source File: ColumnExploder.java    From jpmml-evaluator-spark with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Returns the data type of the column identified by {@code getStructCol()},
 * cast to a struct type.
 *
 * @param schema the schema containing the column
 * @return the column's data type as a {@code StructType}
 * @throws ClassCastException if the column's data type is not a struct type
 */
private StructType getStructSchema(StructType schema){
	return (StructType)schema.apply(getStructCol()).dataType();
}
 
Example 18
Source File: SqlResultsWriter.java    From geowave with Apache License 2.0 4 votes vote down vote up
/**
 * Writes the collected SQL result rows to the output data store as simple
 * features.
 *
 * <p>The feature type is derived from the result schema. Null column values
 * are skipped, the "geom" column is set as a geometry, timestamp columns are
 * converted to {@code Date}, and every other value is set unchanged. Each
 * feature id is "result-" followed by the formatted row index.
 *
 * @param typeName the feature type name; {@code null} selects
 *     {@code DEFAULT_TYPE_NAME} and logs a warning
 */
public void writeResults(String typeName) {
  if (typeName == null) {
    typeName = DEFAULT_TYPE_NAME;
    LOGGER.warn(
        "Using default type name (adapter id): '" + DEFAULT_TYPE_NAME + "' for SQL output");
  }

  final StructType schema = results.schema();
  final SimpleFeatureType featureType = SchemaConverter.schemaToFeatureType(schema, typeName);

  final SimpleFeatureBuilder sfBuilder = new SimpleFeatureBuilder(featureType);

  final FeatureDataAdapter featureAdapter = new FeatureDataAdapter(featureType);

  final DataStore featureStore = outputDataStore.createDataStore();
  final Index featureIndex =
      new SpatialDimensionalityTypeProvider().createIndex(new SpatialOptions());
  featureStore.addType(featureAdapter, featureIndex);
  try (Writer writer = featureStore.createWriter(featureAdapter.getTypeName())) {

    final List<Row> rows = results.collectAsList();

    int rowIndex = 0;
    for (final Row row : rows) {

      for (int col = 0; col < schema.fields().length; col++) {
        final StructField field = schema.apply(col);
        final Object value = row.apply(col);
        if (value == null) {
          // Leave the attribute unset for null values.
          continue;
        }

        if (field.name().equals("geom")) {
          sfBuilder.set("geom", (Geometry) value);
        } else if (field.dataType() == DataTypes.TimestampType) {
          final Date date = new Date(((Timestamp) value).getTime());
          sfBuilder.set(field.name(), date);
        } else {
          sfBuilder.set(field.name(), value);
        }
      }

      final SimpleFeature sf = sfBuilder.buildFeature("result-" + nf.format(rowIndex));

      writer.write(sf);
      rowIndex++;
    }
  }
}
 
Example 19
Source File: SparkRowConverterTest.java    From bunsen with Apache License 2.0 3 votes vote down vote up
/**
 * Recursively walks the schema and asserts that no struct has zero fields.
 *
 * <p>Struct-typed fields and struct-element arrays are descended into, except
 * for arrays held in a field named "contained", which are not checked.
 *
 * @param schema the struct to check
 * @param fieldName the name used for this struct in the failure message
 */
private void checkNoEmptyStructs(StructType schema, String fieldName) {

  StructField[] children = schema.fields();

  Assert.assertNotEquals("Struct field " + fieldName + " is empty",
      0,
      children.length);

  for (StructField child : children) {

    if (child.dataType() instanceof StructType) {
      checkNoEmptyStructs((StructType) child.dataType(), child.name());
      continue;
    }

    if (!(child.dataType() instanceof ArrayType)) {
      continue;
    }

    ArrayType arrayType = (ArrayType) child.dataType();
    boolean hasStructElements = arrayType.elementType() instanceof StructType;

    if (hasStructElements && !child.name().equals("contained")) {
      checkNoEmptyStructs((StructType) arrayType.elementType(), child.name());
    }
  }
}