Java Code Examples for org.apache.spark.sql.types.StructType#fields()

The following examples show how to use org.apache.spark.sql.types.StructType#fields(). Each example comes from an open-source project; the source file and license are noted above it.
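
Before the project examples, here is a minimal, self-contained sketch of the method itself: fields() returns the schema's columns as a StructField[]. The schema below is illustrative.

StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("id", DataTypes.LongType, false),
        DataTypes.createStructField("name", DataTypes.StringType, true) });
// fields() exposes each column as a StructField with name, type, and nullability
for (StructField field : schema.fields()) {
    System.out.println(field.name() + ": " + field.dataType().typeName()
            + " (nullable=" + field.nullable() + ")");
}
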
Example 1
Source File: MLContextUtil.java    From systemds with Apache License 2.0
/**
 * Examine the DataFrame schema to determine whether the data appears to be
 * a matrix.
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if the DataFrame appears to be a matrix,
 *         {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
	StructType schema = df.schema();
	StructField[] fields = schema.fields();
	if (fields == null) {
		return true;
	}
	for (StructField field : fields) {
		DataType dataType = field.dataType();
		if ((dataType != DataTypes.DoubleType) && (dataType != DataTypes.IntegerType)
				&& (dataType != DataTypes.LongType) && (!(dataType instanceof org.apache.spark.ml.linalg.VectorUDT))
				&& (!(dataType instanceof org.apache.spark.mllib.linalg.VectorUDT))) {
			// uncomment if we support arrays of doubles for matrices
			// if (dataType instanceof ArrayType) {
			// ArrayType arrayType = (ArrayType) dataType;
			// if (arrayType.elementType() == DataTypes.DoubleType) {
			// continue;
			// }
			// }
			return false;
		}
	}
	return true;
}
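
A hedged usage sketch for the check above; the spark session and rows are assumed to be in scope, and the column names are illustrative.

StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("c1", DataTypes.DoubleType, false),
        DataTypes.createStructField("c2", DataTypes.StringType, false) });
Dataset<Row> df = spark.createDataFrame(rows, schema);
// false here: c2 is a string column, so the DataFrame does not look like a matrix
boolean looksLikeMatrix = MLContextUtil.doesDataFrameLookLikeMatrix(df);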
 
Example 2
Source File: UnaryTransformer.java    From ambiverse-nlu with Apache License 2.0
@Override
public StructType transformSchema(StructType structType) {
    String inputCol = getInputCol();
    String outputCol = getOutputCol();
    DataType inputType = structType.apply(inputCol).dataType();
    this.validateInputType(inputType);
    List<String> names = Arrays.asList(structType.fieldNames());
    Cond.require(!names.contains(outputCol), "The output column " + outputCol + " already exists in this schema!");
    List<StructField> fields = new ArrayList<>();
    for (int i = 0; i < structType.fields().length; i++) {
        fields.add(structType.fields()[i]);
    }
    DataType dt = getOutputDataType();
    fields.add(DataTypes.createStructField(outputCol, dt, isOutputDataTypeNullable()));
    return DataTypes.createStructType(fields);
}
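
The method validates the input column and appends exactly one output field. The same append pattern in isolation, assuming an illustrative input schema and output column:

List<StructField> fields = new ArrayList<>(Arrays.asList(inputSchema.fields()));
fields.add(DataTypes.createStructField("tokens",
        DataTypes.createArrayType(DataTypes.StringType), true));
StructType outputSchema = DataTypes.createStructType(fields);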
 
Example 3
Source File: DBClientWrapper.java    From spark-data-sources with MIT License
public static edb.common.Row sparkToDBRow(org.apache.spark.sql.Row row, StructType type) {
    edb.common.Row dbRow = new edb.common.Row();
    StructField[] fields = type.fields();
    for (int i = 0; i < type.size(); i++) {
        StructField sf = fields[i];
        if (sf.dataType() == DataTypes.StringType) {
            dbRow.addField(new edb.common.Row.StringField(sf.name(), row.getString(i)));
        } else if (sf.dataType() == DataTypes.DoubleType) {
            dbRow.addField(new edb.common.Row.DoubleField(sf.name(), row.getDouble(i)));
        } else if (sf.dataType() == DataTypes.LongType) {
            dbRow.addField(new edb.common.Row.Int64Field(sf.name(), row.getLong(i)));
        } else {
            // TODO: type leakage
        }
    }

    return dbRow;
}
 
Example 4
Source File: Reader.java    From iceberg with Apache License 2.0
PartitionRowConverter(Schema partitionSchema, PartitionSpec spec) {
  StructType partitionType = SparkSchemaUtil.convert(partitionSchema);
  StructField[] fields = partitionType.fields();

  this.types = new DataType[fields.length];
  this.positions = new int[types.length];
  this.javaTypes = new Class<?>[types.length];
  this.reusedRow = new GenericInternalRow(types.length);

  List<PartitionField> partitionFields = spec.fields();
  for (int rowIndex = 0; rowIndex < fields.length; rowIndex += 1) {
    this.types[rowIndex] = fields[rowIndex].dataType();

    int sourceId = partitionSchema.columns().get(rowIndex).fieldId();
    for (int specIndex = 0; specIndex < partitionFields.size(); specIndex += 1) {
      PartitionField field = partitionFields.get(specIndex);
      if (field.sourceId() == sourceId && "identity".equals(field.transform().toString())) {
        positions[rowIndex] = specIndex;
        javaTypes[rowIndex] = spec.javaClasses()[specIndex];
        break;
      }
    }
  }
}
 
Example 5
Source File: SchemaConverter.java    From geowave with Apache License 2.0
public static SimpleFeatureType schemaToFeatureType(
    final StructType schema,
    final String typeName) {
  final SimpleFeatureTypeBuilder typeBuilder = new SimpleFeatureTypeBuilder();
  typeBuilder.setName(typeName);
  typeBuilder.setNamespaceURI(BasicFeatureTypes.DEFAULT_NAMESPACE);
  try {
    typeBuilder.setCRS(CRS.decode("EPSG:4326", true));
  } catch (final FactoryException e) {
    LOGGER.error(e.getMessage(), e);
  }

  final AttributeTypeBuilder attrBuilder = new AttributeTypeBuilder();

  for (final StructField field : schema.fields()) {
    final AttributeDescriptor attrDesc = attrDescFromStructField(attrBuilder, field);

    typeBuilder.add(attrDesc);
  }

  return typeBuilder.buildFeatureType();
}
 
Example 6
Source File: ExternalTableUtils.java    From spliceengine with GNU Affero General Public License v3.0
public static void setPartitionColumnTypes(StructType dataSchema, int[] baseColumnMap, StructType tableSchema) {
    int ncolumns = dataSchema.fields().length;
    int nPartitions = baseColumnMap.length;
    for (int i = 0; i < baseColumnMap.length; ++i) {
        String name = dataSchema.fields()[ncolumns - i - 1].name();
        org.apache.spark.sql.types.DataType type = tableSchema.fields()[baseColumnMap[nPartitions - i - 1]].dataType();
        boolean nullable = tableSchema.fields()[baseColumnMap[nPartitions - i - 1]].nullable();
        Metadata metadata = tableSchema.fields()[baseColumnMap[nPartitions - i - 1]].metadata();
        StructField field = new StructField(name, type, nullable, metadata);
        dataSchema.fields()[ncolumns - i - 1] = field;
    }
}
 
Example 7
Source File: ColumnUtils.java    From net.jgp.labs.spark with Apache License 2.0
public static Metadata getMetadata(Dataset<Row> df, String colName) {
  StructType schema = df.schema();
  StructField[] fields = schema.fields();
  for (StructField field : fields) {
    // TODO check on case
    if (field.name().compareTo(colName) == 0) {
      return field.metadata();
    }
  }
  return null;
}
 
Example 8
Source File: UnsafeFixedWidthAggregationMap.java    From indexr with Apache License 2.0
/**
 * @return true if UnsafeFixedWidthAggregationMap supports aggregation buffers with the given
 * schema, false otherwise.
 */
public static boolean supportsAggregationBufferSchema(StructType schema) {
    for (StructField field : schema.fields()) {
        if (!UnsafeRow.isMutable(field.dataType())) {
            return false;
        }
    }
    return true;
}
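
In Spark's own UnsafeRow, the mutable types are essentially the fixed-width ones, so a schema of numeric columns passes while a string column does not; this sketch assumes the same semantics for the indexr copy above, and the schemas are illustrative.

StructType fixedWidth = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("count", DataTypes.LongType, false),
        DataTypes.createStructField("sum", DataTypes.DoubleType, false) });
StructType withString = fixedWidth.add("label", DataTypes.StringType);
boolean ok = supportsAggregationBufferSchema(fixedWidth);   // expected: true
boolean bad = supportsAggregationBufferSchema(withString);  // expected: false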
 
Example 9
Source File: DataFrames.java    From DataVec with Apache License 2.0
/**
 * Create a DataVec Schema
 * from a Spark StructType
 *
 * @param structType the struct type to create the schema from
 * @return the created schema
 */
public static Schema fromStructType(StructType structType) {
    Schema.Builder builder = new Schema.Builder();
    StructField[] fields = structType.fields();
    String[] fieldNames = structType.fieldNames();
    for (int i = 0; i < fields.length; i++) {
        String name = fields[i].dataType().typeName().toLowerCase();
        switch (name) {
            case "double":
                builder.addColumnDouble(fieldNames[i]);
                break;
            case "float":
                builder.addColumnFloat(fieldNames[i]);
                break;
            case "long":
                builder.addColumnLong(fieldNames[i]);
                break;
            case "int":
            case "integer":
                builder.addColumnInteger(fieldNames[i]);
                break;
            case "string":
                builder.addColumnString(fieldNames[i]);
                break;
            default:
                throw new RuntimeException("Unknown type: " + name);
        }
    }

    return builder.build();
}
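
A hedged usage sketch for fromStructType, following the type mappings in the switch above; the column names are illustrative.

StructType st = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("price", DataTypes.DoubleType, false),
        DataTypes.createStructField("units", DataTypes.IntegerType, false),
        DataTypes.createStructField("label", DataTypes.StringType, false) });
Schema schema = DataFrames.fromStructType(st);
// the resulting DataVec schema has a Double, an Integer, and a String column, in that order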
 
Example 10
Source File: Reader.java    From iceberg with Apache License 2.0
StructLikeInternalRow(StructType struct) {
  this.types = new DataType[struct.size()];
  StructField[] fields = struct.fields();
  for (int i = 0; i < fields.length; i += 1) {
    types[i] = fields[i].dataType();
  }
}
 
Example 11
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testOutputDataFrameOfVectorsDML() {
	System.out.println("MLContextTest - output DataFrame of vectors DML");

	String s = "m=matrix('1 2 3 4',rows=2,cols=2);";
	Script script = dml(s).out("m");
	MLResults results = ml.execute(script);
	Dataset<Row> df = results.getDataFrame("m", true);
	Dataset<Row> sortedDF = df.sort(RDDConverterUtils.DF_ID_COLUMN);

	// verify column types
	StructType schema = sortedDF.schema();
	StructField[] fields = schema.fields();
	StructField idColumn = fields[0];
	StructField vectorColumn = fields[1];
	Assert.assertTrue(idColumn.dataType() instanceof DoubleType);
	Assert.assertTrue(vectorColumn.dataType() instanceof VectorUDT);

	List<Row> list = sortedDF.collectAsList();

	Row row1 = list.get(0);
	Assert.assertEquals(1.0, row1.getDouble(0), 0.0);
	Vector v1 = (DenseVector) row1.get(1);
	double[] arr1 = v1.toArray();
	Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, arr1, 0.0);

	Row row2 = list.get(1);
	Assert.assertEquals(2.0, row2.getDouble(0), 0.0);
	Vector v2 = (DenseVector) row2.get(1);
	double[] arr2 = v2.toArray();
	Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, arr2, 0.0);
}
 
Example 12
Source File: FrameRDDConverterUtils.java    From systemds with Apache License 2.0
/**
 * Obtain the position of the vector column in a DataFrame schema.
 * 
 * @param dfschema schema as StructType
 * @param containsID if true, the schema contains an ID column
 * @return 0-based column index of the vector column, or -1 if there is no vector column
 */
private static int getColVectFromDFSchema(StructType dfschema, boolean containsID) {
	int off = containsID ? 1 : 0;
	for( int i=off; i<dfschema.fields().length; i++ ) {
		StructField structType = dfschema.apply(i);
		if(structType.dataType() instanceof VectorUDT)
			return i-off;
	}
	
	return -1;
}
 
Example 13
Source File: MLContextConversionUtil.java    From systemds with Apache License 2.0
/**
 * If the MatrixFormat of the DataFrame has not been explicitly specified,
 * attempt to determine the proper MatrixFormat.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param matrixMetadata
 *            the matrix metadata, if available
 */
public static void determineMatrixFormatIfNeeded(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
	if (matrixMetadata == null) {
		return;
	}
	MatrixFormat matrixFormat = matrixMetadata.getMatrixFormat();
	if (matrixFormat != null) {
		return;
	}
	StructType schema = dataFrame.schema();
	boolean hasID = false;
	try {
		schema.fieldIndex(RDDConverterUtils.DF_ID_COLUMN);
		hasID = true;
	} catch (IllegalArgumentException iae) {
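		// fieldIndex throws when the schema has no ID column; hasID remains false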
	}

	StructField[] fields = schema.fields();
	MatrixFormat mf = null;
	if (hasID) {
		if (fields[1].dataType() instanceof VectorUDT) {
			mf = MatrixFormat.DF_VECTOR_WITH_INDEX;
		} else {
			mf = MatrixFormat.DF_DOUBLES_WITH_INDEX;
		}
	} else {
		if (fields[0].dataType() instanceof VectorUDT) {
			mf = MatrixFormat.DF_VECTOR;
		} else {
			mf = MatrixFormat.DF_DOUBLES;
		}
	}

	if (mf == null) {
		throw new MLContextException("DataFrame format not recognized as an accepted SystemDS MatrixFormat");
	}
	matrixMetadata.setMatrixFormat(mf);
}
 
Example 14
Source File: DBClientWrapper.java    From spark-data-sources with MIT License
public static Schema sparkToDbSchema(StructType st) {
    Schema schema = new Schema();
    for (StructField sf: st.fields()) {
        if (sf.dataType() == DataTypes.StringType) {
            schema.addColumn(sf.name(), Schema.ColumnType.STRING);
        } else if (sf.dataType() == DataTypes.DoubleType) {
            schema.addColumn(sf.name(), Schema.ColumnType.DOUBLE);
        } else if (sf.dataType() == DataTypes.LongType) {
            schema.addColumn(sf.name(), Schema.ColumnType.INT64);
        } else {
            // TODO: type leakage
        }
    }
    return schema;
}
 
Example 15
Source File: ExternalTableUtils.java    From spliceengine with GNU Affero General Public License v3.0
public static void checkSchema(StructType tableSchema,
                               StructType dataSchema,
                               int[] partitionColumnMap,
                               String location) throws StandardException {

    StructField[] tableFields = tableSchema.fields();
    StructField[] dataFields = dataSchema.fields();

    if (tableFields.length != dataFields.length) {
        throw StandardException.newException(SQLState.INCONSISTENT_NUMBER_OF_ATTRIBUTE,
                tableFields.length, dataFields.length, location);
    }

    StructField[] partitionedTableFields = new StructField[tableSchema.fields().length];
    Set<Integer> partitionColumns = new HashSet<>();
    for (int pos : partitionColumnMap) {
        partitionColumns.add(pos);
    }
    int index = 0;
    for (int i = 0; i < tableFields.length; ++i) {
        if (!partitionColumns.contains(i)) {
            partitionedTableFields[index++] = tableFields[i];
        }
    }

    for (int i = 0; i < tableFields.length - partitionColumnMap.length; ++i) {

        String tableFieldTypeName = partitionedTableFields[i].dataType().typeName();
        String dataFieldTypeName = dataFields[i].dataType().typeName();
        if (!tableFieldTypeName.equals(dataFieldTypeName)) {
            throw StandardException.newException(SQLState.INCONSISTENT_DATATYPE_ATTRIBUTES,
                    partitionedTableFields[i].name(),
                    partitionedTableFields[i].dataType().toString(),
                    dataFields[i].name(),
                    dataFields[i].dataType().toString(), location);
        }
    }
}
 
Example 16
Source File: SqlResultsWriter.java    From geowave with Apache License 2.0
public void writeResults(String typeName) {
  if (typeName == null) {
    typeName = DEFAULT_TYPE_NAME;
    LOGGER.warn(
        "Using default type name (adapter id): '" + DEFAULT_TYPE_NAME + "' for SQL output");
  }

  final StructType schema = results.schema();
  final SimpleFeatureType featureType = SchemaConverter.schemaToFeatureType(schema, typeName);

  final SimpleFeatureBuilder sfBuilder = new SimpleFeatureBuilder(featureType);

  final FeatureDataAdapter featureAdapter = new FeatureDataAdapter(featureType);

  final DataStore featureStore = outputDataStore.createDataStore();
  final Index featureIndex =
      new SpatialDimensionalityTypeProvider().createIndex(new SpatialOptions());
  featureStore.addType(featureAdapter, featureIndex);
  try (Writer writer = featureStore.createWriter(featureAdapter.getTypeName())) {

    final List<Row> rows = results.collectAsList();

    for (int r = 0; r < rows.size(); r++) {
      final Row row = rows.get(r);

      for (int i = 0; i < schema.fields().length; i++) {
        final StructField field = schema.apply(i);
        final Object rowObj = row.apply(i);
        if (rowObj != null) {
          if (field.name().equals("geom")) {
            final Geometry geom = (Geometry) rowObj;

            sfBuilder.set("geom", geom);
          } else if (field.dataType() == DataTypes.TimestampType) {
            final long millis = ((Timestamp) rowObj).getTime();
            final Date date = new Date(millis);

            sfBuilder.set(field.name(), date);
          } else {
            sfBuilder.set(field.name(), rowObj);
          }
        }
      }

      final SimpleFeature sf = sfBuilder.buildFeature("result-" + nf.format(r));

      writer.write(sf);
    }
  }
}
 
Example 17
Source File: SparkTypeToType.java    From iceberg with Apache License 2.0
SparkTypeToType(StructType root) {
  this.root = root;
  // the root struct's fields use the first ids
  this.nextId = root.fields().length;
}
 
Example 18
Source File: SchemaIntrospectionApp.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Array to Dataframe (Dataset<Row>)")
      .master("local")
      .getOrCreate();

  StructType schema = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField(
          "id",
          DataTypes.IntegerType,
          false),
      DataTypes.createStructField(
          "value-s",
          DataTypes.StringType,
          false),
      DataTypes.createStructField(
          "value-d",
          DataTypes.DoubleType,
          false),
      DataTypes.createStructField(
          "array",
          DataTypes.createArrayType(DataTypes.StringType, false),
          false),
      DataTypes.createStructField(
          "struct",
          DataTypes.createStructType(new StructField[] {
              DataTypes.createStructField(
                  "sid",
                  DataTypes.IntegerType,
                  false),
              DataTypes.createStructField(
                  "svalue",
                  DataTypes.StringType,
                  false) }),
          false),
      DataTypes.createStructField(
          "array-struct",
          DataTypes.createArrayType(
              DataTypes.createStructType(new StructField[] {
                  DataTypes.createStructField(
                      "asid",
                      DataTypes.IntegerType,
                      false),
                  DataTypes.createStructField(
                      "asvalue",
                      DataTypes.StringType,
                      false) })),
          false) });

  List<Row> rows = new ArrayList<>();
  for (int x = 0; x < 10; x++) {
    List<Row> subrows = new ArrayList<>();
    for (int y = 1000; y < 1003; y++) {
      subrows.add(RowFactory.create(y, "Sub " + y));
    }
    Row str = RowFactory.create(x * 5000, "Struct #" + x);
    String[] array =
        new String[] { "v" + (x * 100), "v" + (x * 100 + 1) };
    rows.add(
        RowFactory.create(x, "Value " + x, x / 4.0, array, str, subrows));
  }

  Dataset<Row> df = spark.createDataFrame(rows, schema);
  df.show(false);
  df.printSchema();

  StructType readSchema = df.schema();
  String[] fieldNames = readSchema.fieldNames();
  int i = 0;
  for (String fieldName : fieldNames) {
    log.info("Field #{}: '{}'", i++, fieldName);
  }
  log.info("Catalog: '{}'", readSchema.catalogString());
  StructField[] fields = readSchema.fields();
  i = 0;
  for (StructField field : fields) {
    log.info("DDL for field #{}: '{}'", i++, field.toDDL());
  }
}
 
Example 19
Source File: SparkRowConverterTest.java    From bunsen with Apache License 2.0
/**
 * Recursively walks the schema to ensure there are no struct fields that are empty.
 */
private void checkNoEmptyStructs(StructType schema, String fieldName) {

  Assert.assertNotEquals("Struct field " + fieldName + " is empty",
      0,
      schema.fields().length);

  for (StructField field : schema.fields()) {

    if (field.dataType() instanceof StructType) {

      checkNoEmptyStructs((StructType) field.dataType(), field.name());

    } else if (field.dataType() instanceof ArrayType) {

      ArrayType arrayType = (ArrayType) field.dataType();

      if (arrayType.elementType() instanceof StructType) {

        if (!field.name().equals("contained")) {

          checkNoEmptyStructs((StructType) arrayType.elementType(), field.name());
        }
      }
    }
  }
}
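
A hedged usage sketch for the recursive check above; df is assumed to be in scope.

checkNoEmptyStructs(df.schema(), "root");  // fails the test if any nested struct has zero fields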