Java Code Examples for org.apache.spark.sql.Dataset#schema()

The following examples show how to use org.apache.spark.sql.Dataset#schema(). Each example is drawn from an open-source project; the source file, project, and license are listed above each listing.
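Before the project examples, here is a minimal, self-contained sketch of the call itself (the "people.json" path is a placeholder; any readable source works). schema() returns a StructType, and its fields() array describes each column's name, data type, and nullability:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class SchemaInspection {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("SchemaInspection")
        .master("local")
        .getOrCreate();

    // "people.json" is a placeholder path.
    Dataset<Row> df = spark.read().json("people.json");

    // schema() describes the Dataset's columns as a StructType.
    StructType schema = df.schema();
    for (StructField field : schema.fields()) {
      System.out.println(field.name() + " : " + field.dataType()
          + " (nullable=" + field.nullable() + ")");
    }

    spark.stop();
  }
}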
Example 1
Source File: MLContextUtil.java    From systemds with Apache License 2.0
/**
 * Examine the DataFrame schema to determine whether the data appears to be
 * a matrix.
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if the DataFrame appears to be a matrix,
 *         {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
	StructType schema = df.schema();
	StructField[] fields = schema.fields();
	if (fields == null) {
		return true;
	}
	for (StructField field : fields) {
		DataType dataType = field.dataType();
		if ((dataType != DataTypes.DoubleType) && (dataType != DataTypes.IntegerType)
				&& (dataType != DataTypes.LongType) && (!(dataType instanceof org.apache.spark.ml.linalg.VectorUDT))
				&& (!(dataType instanceof org.apache.spark.mllib.linalg.VectorUDT))) {
			// uncomment if we support arrays of doubles for matrices
			// if (dataType instanceof ArrayType) {
			// ArrayType arrayType = (ArrayType) dataType;
			// if (arrayType.elementType() == DataTypes.DoubleType) {
			// continue;
			// }
			// }
			return false;
		}
	}
	return true;
}
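A hypothetical usage, assuming an active SparkSession named spark (the column names are made up for illustration). Note that comparing DataType with == works here because DataTypes.DoubleType and friends are shared singleton instances:

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

Dataset<Row> numeric = spark.createDataFrame(
		Arrays.asList(RowFactory.create(1.0, 2.0)),
		new StructType()
				.add("x", DataTypes.DoubleType)
				.add("y", DataTypes.DoubleType));

// Every column is double-typed, so the data can back a matrix.
boolean looksLikeMatrix = MLContextUtil.doesDataFrameLookLikeMatrix(numeric); // true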
 
Example 2
Source File: CsvSourceTest.java    From kylin-on-parquet-v2 with Apache License 2.0
@Test
public void testGetSourceDataFromFactTable() {
    CubeManager cubeMgr = CubeManager.getInstance(getTestConfig());
    CubeInstance cube = cubeMgr.getCube(CUBE_NAME);
    TableDesc fact = MetadataConverter.extractFactTable(cube);
    List<ColumnDesc> colDescs = Lists.newArrayList();
    Iterator<ColumnDesc> iterator = fact.columns().iterator();
    while (iterator.hasNext()) {
        colDescs.add(iterator.next());
    }

    NSparkCubingEngine.NSparkCubingSource cubingSource = new CsvSource().adaptToBuildEngine(NSparkCubingEngine.NSparkCubingSource.class);
    Dataset<Row> cubeDS = cubingSource.getSourceData(fact, ss, Maps.newHashMap());
    cubeDS.take(10);
    StructType schema = cubeDS.schema();
    for (int i = 0; i < colDescs.size(); i++) {
        StructField field = schema.fields()[i];
        Assert.assertEquals(field.name(), colDescs.get(i).columnName());
        Assert.assertEquals(field.dataType(), colDescs.get(i).dataType());
    }
}
 
Example 3
Source File: MLContextConversionUtil.java    From systemds with Apache License 2.0
/**
 * If the FrameFormat of the DataFrame has not been explicitly specified,
 * attempt to determine the proper FrameFormat.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param frameMetadata
 *            the frame metadata, if available
 */
public static void determineFrameFormatIfNeeded(Dataset<Row> dataFrame, FrameMetadata frameMetadata) {
	FrameFormat frameFormat = frameMetadata.getFrameFormat();
	if (frameFormat != null) {
		return;
	}

	StructType schema = dataFrame.schema();
	boolean hasID = false;
	try {
		schema.fieldIndex(RDDConverterUtils.DF_ID_COLUMN);
		hasID = true;
	} catch (IllegalArgumentException iae) {
		// fieldIndex throws when the ID column is absent; hasID stays false
	}

	FrameFormat ff = hasID ? FrameFormat.DF_WITH_INDEX : FrameFormat.DF;
	frameMetadata.setFrameFormat(ff);
}
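The try/catch above is how the utility probes for an optional column: StructType#fieldIndex throws IllegalArgumentException when the field is absent. An exception-free alternative, sketched with the same schema variable and RDDConverterUtils.DF_ID_COLUMN constant:

import java.util.Arrays;

// fieldNames() lists every top-level column, so absence is a plain lookup.
boolean hasID = Arrays.asList(schema.fieldNames())
		.contains(RDDConverterUtils.DF_ID_COLUMN);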
 
Example 4
Source File: HoodieReadClient.java    From hudi with Apache License 2.0
/**
 * Given a set of Hoodie keys, fetches the corresponding individual records as a DataFrame.
 *
 * @return a DataFrame containing the matching records
 */
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
  assertSqlContext();
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD =
      index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
  JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
      lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
  List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
      .map(keyFileTuple -> keyFileTuple._2().get()).collect();

  // record locations might be the same for multiple keys, so we need a unique list
  Set<String> uniquePaths = new HashSet<>(paths);
  Dataset<Row> originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
  StructType schema = originalDF.schema();
  JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
    HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
        row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
    return new Tuple2<>(key, row);
  });

  // Now, filter down to only the rows that match the supplied hoodie keys
  JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
  return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
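The pattern worth noting: the schema is captured from originalDF before dropping to the RDD API, so the joined rows can be turned back into a typed DataFrame with createDataFrame(rowRDD, schema). A stripped-down sketch of the same round trip, assuming a Dataset<Row> named df and an active SparkSession named spark:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;

StructType schema = df.schema();          // capture before leaving the Dataset API
JavaRDD<Row> filtered = df.javaRDD()
    .filter(row -> !row.isNullAt(0));     // any row-level predicate
Dataset<Row> rebuilt = spark.createDataFrame(filtered, schema); // same columns and types as df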
 
Example 5
Source File: TranslateDeriver.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  Dataset<Row> step = getStep(dependencies, fieldName);

  StructType translatedSchema = getTranslatedSchema(
      step.schema(), getTranslateFunction(translatorConfig).getProvidingSchema());

  JavaRDD<Row> translation = step.javaRDD().flatMap(
      new DeriverTranslateFunction(fieldName, translatorConfig));

  TranslationResults translationResults = new TranslationResults(
      translation, translatedSchema, step.schema());

  errored = translationResults.getErrored();

  return translationResults.getTranslated();
}
 
Example 6
Source File: CsvSourceTest.java    From kylin-on-parquet-v2 with Apache License 2.0
@Test
public void testGetFlatTable() throws IOException {
    System.out.println(getTestConfig().getMetadataUrl());
    CubeManager cubeMgr = CubeManager.getInstance(getTestConfig());
    CubeInstance cube = cubeMgr.getCube(CUBE_NAME);
    cleanupSegments(CUBE_NAME);
    DataModelDesc model = cube.getModel();
    CubeSegment segment = cubeMgr.appendSegment(cube, new SegmentRange.TSRange(dateToLong("2010-01-01"), dateToLong("2013-01-01")));
    Dataset<Row> ds = initFlatTable(segment);
    ds.show(10);
    StructType schema = ds.schema();

    SegmentInfo segmentInfo = MetadataConverter.getSegmentInfo(segment.getCubeInstance(), segment.getUuid(),
            segment.getName(), segment.getStorageLocationIdentifier());
    scala.collection.immutable.Map<String, String> map = BuildUtils.getColumnIndexMap(segmentInfo);
    for (StructField field : schema.fields()) {
        Assert.assertNotNull(model.findColumn(map.apply(field.name())));
    }

    for (LayoutEntity layoutEntity : MetadataConverter.extractEntityList2JavaList(cube)) {
        Set<Integer> dims = layoutEntity.getOrderedDimensions().keySet();
        Column[] modelCols = new Column[dims.size()];
        int index = 0;
        for (int id : dims) {
            modelCols[index] = new Column(String.valueOf(id));
            index++;
        }
        ds.select(modelCols).show(10);
    }
}
 
Example 7
Source File: MLContextConversionUtil.java    From systemds with Apache License 2.0
/**
 * If the MatrixFormat of the DataFrame has not been explicitly specified,
 * attempt to determine the proper MatrixFormat.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param matrixMetadata
 *            the matrix metadata, if available
 */
public static void determineMatrixFormatIfNeeded(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
	if (matrixMetadata == null) {
		return;
	}
	MatrixFormat matrixFormat = matrixMetadata.getMatrixFormat();
	if (matrixFormat != null) {
		return;
	}
	StructType schema = dataFrame.schema();
	boolean hasID = false;
	try {
		schema.fieldIndex(RDDConverterUtils.DF_ID_COLUMN);
		hasID = true;
	} catch (IllegalArgumentException iae) {
		// fieldIndex throws when the ID column is absent; hasID stays false
	}

	StructField[] fields = schema.fields();
	MatrixFormat mf = null;
	if (hasID) {
		if (fields[1].dataType() instanceof VectorUDT) {
			mf = MatrixFormat.DF_VECTOR_WITH_INDEX;
		} else {
			mf = MatrixFormat.DF_DOUBLES_WITH_INDEX;
		}
	} else {
		if (fields[0].dataType() instanceof VectorUDT) {
			mf = MatrixFormat.DF_VECTOR;
		} else {
			mf = MatrixFormat.DF_DOUBLES;
		}
	}

	if (mf == null) {
		throw new MLContextException("DataFrame format not recognized as an accepted SystemDS MatrixFormat");
	}
	matrixMetadata.setMatrixFormat(mf);
}
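To see the VectorUDT branch fire, a one-column DataFrame of ML vectors suffices. A sketch, assuming an active SparkSession named spark; SQLDataTypes.VectorType() is the public handle for the ML vector data type:

import java.util.Arrays;

import org.apache.spark.ml.linalg.SQLDataTypes;
import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

StructType vecSchema = new StructType(new StructField[] {
		new StructField("features", SQLDataTypes.VectorType(), false, Metadata.empty()) });
Dataset<Row> vectors = spark.createDataFrame(
		Arrays.asList(RowFactory.create(Vectors.dense(1.0, 2.0))), vecSchema);

// No ID column and a VectorUDT first field => MatrixFormat.DF_VECTOR
System.out.println(vectors.schema().fields()[0].dataType() instanceof VectorUDT); // true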
 
Example 8
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testOutputDataFrameOfVectorsDML() {
	System.out.println("MLContextTest - output DataFrame of vectors DML");

	String s = "m=matrix('1 2 3 4',rows=2,cols=2);";
	Script script = dml(s).out("m");
	MLResults results = ml.execute(script);
	Dataset<Row> df = results.getDataFrame("m", true);
	Dataset<Row> sortedDF = df.sort(RDDConverterUtils.DF_ID_COLUMN);

	// verify column types
	StructType schema = sortedDF.schema();
	StructField[] fields = schema.fields();
	StructField idColumn = fields[0];
	StructField vectorColumn = fields[1];
	Assert.assertTrue(idColumn.dataType() instanceof DoubleType);
	Assert.assertTrue(vectorColumn.dataType() instanceof VectorUDT);

	List<Row> list = sortedDF.collectAsList();

	Row row1 = list.get(0);
	Assert.assertEquals(1.0, row1.getDouble(0), 0.0);
	Vector v1 = (DenseVector) row1.get(1);
	double[] arr1 = v1.toArray();
	Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, arr1, 0.0);

	Row row2 = list.get(1);
	Assert.assertEquals(2.0, row2.getDouble(0), 0.0);
	Vector v2 = (DenseVector) row2.get(1);
	double[] arr2 = v2.toArray();
	Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, arr2, 0.0);
}
 
Example 9
Source File: ColumnUtils.java    From net.jgp.labs.spark with Apache License 2.0
public static Metadata getMetadata(Dataset<Row> df, String colName) {
  StructType schema = df.schema();
  StructField[] fields = schema.fields();
  for (StructField field : fields) {
    // TODO: the comparison is case-sensitive; consider equalsIgnoreCase
    if (field.name().equals(colName)) {
      return field.metadata();
    }
  }
  return null;
}
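The TODO flags that the exact-match comparison is case-sensitive, while Spark's analyzer resolves column names case-insensitively by default. A hypothetical case-insensitive variant of the same lookup:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;

public static Metadata getMetadataIgnoreCase(Dataset<Row> df, String colName) {
  for (StructField field : df.schema().fields()) {
    if (field.name().equalsIgnoreCase(colName)) {
      return field.metadata();
    }
  }
  return null;
}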
 
Example 10
Source File: SchemaIntrospectionApp.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Array to Dataframe (Dataset<Row>)")
      .master("local")
      .getOrCreate();

  StructType schema = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField(
          "id",
          DataTypes.IntegerType,
          false),
      DataTypes.createStructField(
          "value-s",
          DataTypes.StringType,
          false),
      DataTypes.createStructField(
          "value-d",
          DataTypes.DoubleType,
          false),
      DataTypes.createStructField(
          "array",
          DataTypes.createArrayType(DataTypes.StringType, false),
          false),
      DataTypes.createStructField(
          "struct",
          DataTypes.createStructType(new StructField[] {
              DataTypes.createStructField(
                  "sid",
                  DataTypes.IntegerType,
                  false),
              DataTypes.createStructField(
                  "svalue",
                  DataTypes.StringType,
                  false) }),
          false),
      DataTypes.createStructField(
          "array-struct",
          DataTypes.createArrayType(
              DataTypes.createStructType(new StructField[] {
                  DataTypes.createStructField(
                      "asid",
                      DataTypes.IntegerType,
                      false),
                  DataTypes.createStructField(
                      "asvalue",
                      DataTypes.StringType,
                      false) })),
          false) });

  List<Row> rows = new ArrayList<>();
  for (int x = 0; x < 10; x++) {
    List<Row> subrows = new ArrayList<>();
    for (int y = 1000; y < 1003; y++) {
      subrows.add(RowFactory.create(y, "Sub " + y));
    }
    Row str = RowFactory.create(x * 5000, "Struct #" + x);
    String[] array =
        new String[] { "v" + (x * 100), "v" + (x * 100 + 1) };
    rows.add(
        RowFactory.create(x, "Value " + x, x / 4.0, array, str, subrows));
  }

  Dataset<Row> df = spark.createDataFrame(rows, schema);
  df.show(false);
  df.printSchema();

  StructType readSchema = df.schema();
  String[] fieldNames = readSchema.fieldNames();
  int i = 0;
  for (String fieldName : fieldNames) {
    log.info("Field #{}: '{}'", i++, fieldName);
  }
  log.info("Catalog: '{}'", readSchema.catalogString());
  StructField[] fields = readSchema.fields();
  i = 0;
  for (StructField field : fields) {
    log.info("DDL for field #{}: '{}'", i++, field.toDDL());
  }
}
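fieldNames() and fields() only cover the top level; nested structs such as "struct" and "array-struct" keep their children inside the field's DataType. A short recursive walk, sketched as an addition to the class above:

import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataType;

// Recursively print every field, descending into structs and arrays of structs.
private void walk(DataType type, String indent) {
  if (type instanceof StructType) {
    for (StructField field : ((StructType) type).fields()) {
      log.info("{}{}: {}", indent, field.name(), field.dataType().typeName());
      walk(field.dataType(), indent + "  ");
    }
  } else if (type instanceof ArrayType) {
    walk(((ArrayType) type).elementType(), indent);
  }
}

// Called as: walk(df.schema(), "");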
 
Example 11
Source File: ColumnExploder.java    From jpmml-evaluator-spark with GNU Affero General Public License v3.0
@Override
public Dataset<Row> transform(Dataset<?> dataset){
	StructType schema = dataset.schema();

	StructType structSchema = getStructSchema(schema);

	Column structColumn = dataset.apply(DatasetUtil.escapeColumnName(getStructCol()));

	Dataset<Row> result = dataset.toDF();

	StructField[] fields = structSchema.fields();
	for(StructField field : fields){
		String name = field.name();

		Column fieldColumn = structColumn.getField(DatasetUtil.escapeColumnName(name));

		result = result.withColumn(DatasetUtil.escapeColumnName(name), fieldColumn);
	}

	return result;
}
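In effect, the transformer copies every field of a struct column to the top level, using schema() to discover the field names. The same flattening in plain Spark, assuming a struct column named "s" with fields "a" and "b" (illustrative names):

Dataset<Row> flattened = df
		.withColumn("a", df.col("s").getField("a"))
		.withColumn("b", df.col("s").getField("b"));

// Or, keeping only the struct's fields, expanded in one go:
Dataset<Row> structOnly = df.select("s.*");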