org.apache.spark.sql.types.StructType Java Examples

The following examples show how to use org.apache.spark.sql.types.StructType. Each example is taken from an open source project; the source file, project, and license are noted above each snippet, and you can follow the link to the original source for full context.
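Before the project examples, here is a minimal, self-contained sketch of building a StructType by hand with Spark's public Java API. The column names and types are purely illustrative.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class StructTypeSketch {
  public static void main(String[] args) {
    // Preferred factory style: build the schema from a list of StructFields.
    List<StructField> fields = Arrays.asList(
        DataTypes.createStructField("id", DataTypes.LongType, false),
        DataTypes.createStructField("name", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);

    // Equivalent constructor style, as used by several examples below.
    StructType sameSchema = new StructType(new StructField[] {
        new StructField("id", DataTypes.LongType, false, Metadata.empty()),
        new StructField("name", DataTypes.StringType, true, Metadata.empty())
    });

    System.out.println(schema.treeString());
    System.out.println(sameSchema.json());
  }
}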
Example #1
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public StreamWriter createStreamWriter(String runId, StructType dsStruct,
                                       OutputMode mode, DataSourceOptions options) {
  Preconditions.checkArgument(
      mode == OutputMode.Append() || mode == OutputMode.Complete(),
      "Output mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  // Spark 2.4.x passes runId to createStreamWriter instead of real queryId,
  // so we fetch it directly from sparkContext to make writes idempotent
  String queryId = lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY());
  String appId = lazySparkSession().sparkContext().applicationId();

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new StreamingWriter(table, io, encryptionManager, options, queryId, mode, appId, writeSchema, dsStruct);
}
 
Example #2
Source File: BigQueryDataSourceReader.java    From spark-bigquery-connector with Apache License 2.0
public BigQueryDataSourceReader(
        TableInfo table,
        BigQueryClient bigQueryClient,
        BigQueryReadClientFactory bigQueryReadClientFactory,
        ReadSessionCreatorConfig readSessionCreatorConfig,
        Optional<String> globalFilter,
        Optional<StructType> schema) {
    this.table = table;
    this.tableId = table.getTableId();
    this.readSessionCreatorConfig = readSessionCreatorConfig;
    this.bigQueryClient = bigQueryClient;
    this.bigQueryReadClientFactory = bigQueryReadClientFactory;
    this.readSessionCreator = new ReadSessionCreator(readSessionCreatorConfig, bigQueryClient, bigQueryReadClientFactory);
    this.globalFilter = globalFilter;
    this.schema = schema;
}
 
Example #3
Source File: ParquetWithSparkSchemaVisitor.java    From iceberg with Apache License 2.0
private static <T> List<T> visitFields(StructType struct, GroupType group,
                                       ParquetWithSparkSchemaVisitor<T> visitor) {
  StructField[] sFields = struct.fields();
  Preconditions.checkArgument(sFields.length == group.getFieldCount(),
      "Structs do not match: %s and %s", struct, group);
  List<T> results = Lists.newArrayListWithExpectedSize(group.getFieldCount());
  for (int i = 0; i < sFields.length; i += 1) {
    Type field = group.getFields().get(i);
    StructField sField = sFields[i];
    Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())),
        "Structs do not match: field %s != %s", field.getName(), sField.name());
    results.add(visitField(sField, field, visitor));
  }

  return results;
}
 
Example #4
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");

	List<Vector> list = new ArrayList<>();
	list.add(Vectors.dense(1.0, 2.0, 3.0));
	list.add(Vectors.dense(4.0, 5.0, 6.0));
	list.add(Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #5
Source File: ProtobufUtils.java    From envelope with Apache License 2.0
/**
 * Retrieves and converts Protobuf fields from a Message.
 * <p>
 * If the field in the {@link com.google.protobuf.Descriptors.Descriptor} exists in the {@link Message}, the value is
 * retrieved and converted using {@link #getFieldValue(Descriptors.FieldDescriptor, Object, DataType)}.
 * Otherwise, the field value is {@code null}.
 * The extraction honors the order of the {@code Descriptor}.
 *
 * @param dsc the Protobuf Descriptor with all fields
 * @param msg the Message with the current field values
 * @param schema the Dataset schema derived from the Descriptor
 * @return a list of converted values
 */
public static List<Object> buildRowValues(Descriptors.Descriptor dsc, Message msg, StructType schema) {
  List<Object> values = new ArrayList<>();
  Object val;

  for (Descriptors.FieldDescriptor fd : dsc.getFields()) {
    if ( (!fd.isRepeated() && msg.hasField(fd)) || (fd.isRepeated() && msg.getRepeatedFieldCount(fd) > 0) ) {
      val = getFieldValue(fd, msg.getField(fd), schema.apply(fd.getName()).dataType());
    } else {
      LOG.trace("FieldDescriptor[{}] => not found", fd.getFullName());
      val = null;
    }
    values.add(val);
  }

  return values;
}
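The method above resolves each field's Spark DataType with schema.apply(fd.getName()).dataType() before converting the value. A standalone sketch of that StructType lookup pattern, with hypothetical field names:

import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class FieldLookupSketch {
  public static void main(String[] args) {
    StructType schema = new StructType()
        .add("user_id", DataTypes.LongType, false)
        .add("email", DataTypes.StringType, true);

    // apply(name) returns the StructField for that column name.
    StructField emailField = schema.apply("email");
    DataType emailType = emailField.dataType();   // StringType
    int position = schema.fieldIndex("email");    // positional index of the column

    System.out.println(emailType.simpleString() + " at index " + position);
  }
}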
 
Example #6
Source File: NManualBuildAndQueryCuboidTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private Integer convertOutSchema(Dataset<Row> layoutDs, String fieldName,
                                 org.apache.spark.sql.types.DataType dataType) {
    StructField[] structFieldList = layoutDs.schema().fields();
    String[] columns = layoutDs.columns();

    int index = 0;
    StructField[] outStructFieldList = new StructField[structFieldList.length];
    for (int i = 0; i < structFieldList.length; i++) {
        if (columns[i].equalsIgnoreCase(fieldName)) {
            index = i;
            StructField structField = structFieldList[i];
            outStructFieldList[i] = new StructField(structField.name(), dataType, false, structField.metadata());
        } else {
            outStructFieldList[i] = structFieldList[i];
        }
    }

    OUT_SCHEMA = new StructType(outStructFieldList);

    return index;
}
 
Example #7
Source File: JavaBinarizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBinarizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 0.1),
    RowFactory.create(1, 0.8),
    RowFactory.create(2, 0.2)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

  Binarizer binarizer = new Binarizer()
    .setInputCol("feature")
    .setOutputCol("binarized_feature")
    .setThreshold(0.5);

  Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

  System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
  binarizedDataFrame.show();
  // $example off$

  spark.stop();
}
 
Example #8
Source File: SchemaUtilTest.java    From spark-llap with Apache License 2.0
@Test
public void testBuildHiveCreateTableQueryFromSparkDFSchema() {

  HiveWarehouseSessionState sessionState =
      HiveWarehouseBuilder
          .session(session)
          .userPassword(TEST_USER, TEST_PASSWORD)
          .hs2url(TEST_HS2_URL)
          .dbcp2Conf(TEST_DBCP2_CONF)
          .maxExecResults(TEST_EXEC_RESULTS_MAX)
          .defaultDB(TEST_DEFAULT_DB)
          .sessionStateForTest();
  HiveWarehouseSession hive = new MockHiveWarehouseSessionImpl(sessionState);

  HiveWarehouseSessionImpl.HIVE_WAREHOUSE_CONNECTOR_INTERNAL = "com.hortonworks.spark.sql.hive.llap.MockHiveWarehouseConnector";

  StructType schema = getSchema();
  String query = SchemaUtil.buildHiveCreateTableQueryFromSparkDFSchema(schema, "testDB", "testTable");
  System.out.println("create table query:" + query);
  assertTrue(hive.executeUpdate(query));
}
 
Example #9
Source File: SparkParquetReadersFlatDataBenchmark.java    From iceberg with Apache License 2.0
@Benchmark
@Threads(1)
public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException {
  StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA);
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(PROJECTED_SCHEMA)
      .readSupport(new ParquetReadSupport())
      .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json())
      .set("spark.sql.parquet.binaryAsString", "false")
      .set("spark.sql.parquet.int96AsTimestamp", "false")
      .callInit()
      .build()) {

    for (InternalRow row : rows) {
      blackhole.consume(row);
    }
  }
}
 
Example #10
Source File: TestMorphlineTranslator.java    From envelope with Apache License 2.0
@Test
public void getSchema() throws Exception {

  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(MorphlineTranslator.ENCODING_KEY, "UTF-8");
  configMap.put(MorphlineTranslator.ENCODING_MSG, "UTF-8");
  configMap.put(MorphlineTranslator.MORPHLINE, getResourcePath(MORPHLINE_FILE));
  configMap.put(MorphlineTranslator.MORPHLINE_ID, "default");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "flat");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
      Lists.newArrayList("bar", "foo"));
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
      Lists.newArrayList("int", "string"));
  Config config = ConfigFactory.parseMap(configMap);

  translator.configure(config);
  StructType schema = translator.getProvidingSchema();

  Assert.assertEquals("Invalid number of SchemaFields", 2, schema.fields().length);
  Assert.assertEquals("Invalid DataType", DataTypes.IntegerType, schema.fields()[0].dataType());
  Assert.assertEquals("Invalid DataType", DataTypes.StringType, schema.fields()[1].dataType());
}
 
Example #11
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with ID column, no format specified");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #12
Source File: RowProcessor.java    From net.jgp.labs.spark with Apache License 2.0
@Override
public void call(JavaRDD<String> rdd) throws Exception {

  JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
    private static final long serialVersionUID = 5167089361335095997L;

    @Override
    public Row call(String msg) {
      Row row = RowFactory.create(msg);
      return row;
    }
  });
  // Create Schema
  StructType schema = DataTypes.createStructType(
      new StructField[] { DataTypes.createStructField("Message",
          DataTypes.StringType, true) });

  // Get Spark 2.0 session
  SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context()
      .getConf());
  Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
  msgDataFrame.show();
}
 
Example #13
Source File: HoodieReadClient.java    From hudi with Apache License 2.0
/**
 * Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
 *
 * @return a dataframe
 */
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
  assertSqlContext();
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD =
      index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
  JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
      lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
  List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
      .map(keyFileTuple -> keyFileTuple._2().get()).collect();

  // record locations might be same for multiple keys, so need a unique list
  Set<String> uniquePaths = new HashSet<>(paths);
  Dataset<Row> originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
  StructType schema = originalDF.schema();
  JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
    HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
        row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
    return new Tuple2<>(key, row);
  });

  // Now, we need to further filter out, for only rows that match the supplied hoodie keys
  JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
  return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
 
Example #14
Source File: RewriteManifestsAction.java    From iceberg with Apache License 2.0
private static MapPartitionsFunction<Row, ManifestFile> toManifests(
    Broadcast<FileIO> io, long maxNumManifestEntries, String location,
    int format, PartitionSpec spec, StructType sparkType) {

  return (MapPartitionsFunction<Row, ManifestFile>) rows -> {
    List<Row> rowsAsList = Lists.newArrayList(rows);

    if (rowsAsList.isEmpty()) {
      return Collections.emptyIterator();
    }

    List<ManifestFile> manifests = Lists.newArrayList();
    if (rowsAsList.size() <= maxNumManifestEntries) {
      manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType));
    } else {
      int midIndex = rowsAsList.size() / 2;
      manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType));
      manifests.add(writeManifest(rowsAsList,  midIndex, rowsAsList.size(), io, location, format, spec, sparkType));
    }

    return manifests.iterator();
  };
}
 
Example #15
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLMllibVectorWithIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, mllib vector with ID column");

	List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleMllibVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #16
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLMllibVectorWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, mllib vector with no ID column");

	List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<>();
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #17
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumDMLDoublesWithNoIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum DML, doubles with no ID column, no format specified");

	List<String> list = new ArrayList<>();
	list.add("2,2,2");
	list.add("3,3,3");
	list.add("4,4,4");
	JavaRDD<String> javaRddString = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
	setExpectedStdOut("sum: 27.0");
	ml.execute(script);
}
 
Example #18
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example #19
Source File: ExternalTableUtils.java    From spliceengine with GNU Affero General Public License v3.0
public static StructType supportAvroDateType(StructType schema, String storedAs) {
    if (storedAs.toLowerCase().equals("a")) {
        for (int i = 0; i < schema.size(); i++) {
            StructField column = schema.fields()[i];
            if (column.dataType().equals(DataTypes.DateType)) {
                StructField replace = DataTypes.createStructField(column.name(), DataTypes.StringType, column.nullable(), column.metadata());
                schema.fields()[i] = replace;
            }
        }
    }
    return schema;
}
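The helper above rewrites the schema in place by assigning into the array returned by schema.fields(). A non-mutating sketch of the same idea, rebuilding the schema instead (assuming the goal is still to represent DateType columns as strings for Avro), could look like this:

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public final class SchemaRewriteSketch {
  // Return a new schema in which every DateType column becomes StringType.
  public static StructType datesAsStrings(StructType schema) {
    StructField[] fields = schema.fields();
    StructField[] rewritten = new StructField[fields.length];
    for (int i = 0; i < fields.length; i++) {
      StructField f = fields[i];
      rewritten[i] = f.dataType().equals(DataTypes.DateType)
          ? DataTypes.createStructField(f.name(), DataTypes.StringType, f.nullable(), f.metadata())
          : f;
    }
    return new StructType(rewritten);
  }
}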
 
Example #20
Source File: SchemaUtils.java    From envelope with Apache License 2.0
public static StructType subsetSchema(StructType schema, final List<String> fieldNames) {
  Seq<StructField> fieldSeq = schema.toTraversable().filter(new AbstractFunction1<StructField, Object>() {
    @Override
    public Object apply(StructField field) {
      return fieldNames.contains(field.name());
    }
  }).toSeq();

  StructType subset = DataTypes.createStructType(JavaConversions.seqAsJavaList(fieldSeq));

  return subset;
}
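The subsetSchema method above leans on Scala collection interop (AbstractFunction1, JavaConversions). For comparison, here is a sketch of the same field projection written only against the Java-facing DataTypes API:

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public final class SubsetSchemaSketch {
  // Keep only the fields whose names appear in fieldNames, preserving schema order.
  public static StructType subsetSchema(StructType schema, List<String> fieldNames) {
    List<StructField> kept = new ArrayList<>();
    for (StructField field : schema.fields()) {
      if (fieldNames.contains(field.name())) {
        kept.add(field);
      }
    }
    return DataTypes.createStructType(kept);
  }
}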
 
Example #21
Source File: RDDConverterUtilsExt.java    From systemds with Apache License 2.0
/**
 * Add element indices as new column to DataFrame
 *
 * @param df input data frame
 * @param sparkSession the Spark Session
 * @param nameOfCol name of index column
 * @return new data frame
 */
public static Dataset<Row> addIDToDataFrame(Dataset<Row> df, SparkSession sparkSession, String nameOfCol) {
	StructField[] oldSchema = df.schema().fields();
	StructField[] newSchema = new StructField[oldSchema.length + 1];
	for(int i = 0; i < oldSchema.length; i++) {
		newSchema[i] = oldSchema[i];
	}
	newSchema[oldSchema.length] = DataTypes.createStructField(nameOfCol, DataTypes.DoubleType, false);
	// JavaRDD<Row> newRows = df.rdd().toJavaRDD().map(new AddRowID());
	JavaRDD<Row> newRows = df.rdd().toJavaRDD().zipWithIndex().map(new AddRowID());
	return sparkSession.createDataFrame(newRows, new StructType(newSchema));
}
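A short, hypothetical usage sketch for the helper above; the input path and the column name "ID" are examples only.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class AddIdUsageSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("AddIdUsageSketch")
        .master("local[*]")   // local run for illustration only
        .getOrCreate();

    // Any existing DataFrame works; the parquet path here is hypothetical.
    Dataset<Row> df = spark.read().parquet("/tmp/input");

    // RDDConverterUtilsExt is the class shown in the example above (import omitted).
    Dataset<Row> withId = RDDConverterUtilsExt.addIDToDataFrame(df, spark, "ID");
    withId.printSchema();   // original columns plus a non-nullable double column "ID"

    spark.stop();
  }
}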
 
Example #22
Source File: TestAvroUtils.java    From envelope with Apache License 2.0
@Test
public void toSchemaStructTypesNested() throws Exception {

  StructType input = DataTypes.createStructType(Lists.newArrayList(
      // Outer
      DataTypes.createStructField("Outer", DataTypes.createStructType(
          Lists.newArrayList(
              // Inner
              DataTypes.createStructField("Inner", DataTypes.createStructType(Lists.newArrayList(
                  DataTypes.createStructField("field1", DataTypes.IntegerType, false)
                  )),
                  false)
          )), false)
      )
  );

  Schema schema = AvroUtils.schemaFor(input);

  assertEquals("Invalid outer record name", "record0", schema.getName());
  assertEquals("Invalid outer field count", 1, schema.getFields().size());
  assertEquals("Invalid outer field name", "Outer", schema.getFields().get(0).name());
  assertEquals("Invalid outer field type", Schema.Type.RECORD, schema.getFields().get(0).schema().getType());

  assertEquals("Invalid inner record name", "record1", schema.getFields().get(0).schema().getName());
  assertEquals("Invalid inner field count", 1, schema.getFields().get(0).schema().getFields().size());
  assertEquals("Invalid inner field name", "Inner", schema.getFields().get(0).schema().getFields().get(0).name());
  assertEquals("Invalid inner field type", Schema.Type.RECORD, schema.getFields().get(0).schema().getFields().get(0).schema().getType());

  assertEquals("Invalid inner record name", "record2", schema.getFields().get(0).schema().getFields().get(0).schema().getName());
  assertEquals("Invalid nested field count", 1, schema.getFields().get(0).schema().getFields().get(0).schema().getFields().size());
  assertEquals("Invalid nested field name", "field1", schema.getFields().get(0).schema().getFields().get(0).schema().getFields().get(0).name());
  assertEquals("Invalid nested field type", Schema.Type.INT, schema.getFields().get(0).schema().getFields().get(0).schema().getFields().get(0).schema().getType());

  //System.out.println(schema.toString(true));
}
 
Example #23
Source File: TranslateFunction.java    From envelope with Apache License 2.0
@Override
public void receiveProvidedSchema(StructType providedSchema) {
  this.providedSchema = providedSchema;

  if (getTranslator() instanceof UsesProvidedSchema) {
    ((UsesProvidedSchema)getTranslator()).receiveProvidedSchema(providedSchema);
  }
}
 
Example #24
Source File: CsvDFSSource.java    From hudi with Apache License 2.0
public CsvDFSSource(TypedProperties props,
    JavaSparkContext sparkContext,
    SparkSession sparkSession,
    SchemaProvider schemaProvider) {
  super(props, sparkContext, sparkSession, schemaProvider);
  this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration());
  if (schemaProvider != null) {
    sourceSchema = (StructType) SchemaConverters.toSqlType(schemaProvider.getSourceSchema())
        .dataType();
  } else {
    sourceSchema = null;
  }
}
 
Example #25
Source File: AbstractJavaEsSparkSQLTest.java    From elasticsearch-hadoop with Apache License 2.0
private Dataset<Row> artistsAsDataset() throws Exception {
  // don't use the sc.textFile as it pulls in the Hadoop madness (2.x vs 1.x)
  Path path = Paths.get(testData.sampleArtistsDatUri());
  // because Windows...
  List<String> lines = Files.readAllLines(path, StandardCharsets.ISO_8859_1);
  JavaRDD<String> data = sc.parallelize(lines);

  StructType schema = DataTypes
      .createStructType(new StructField[] {
          DataTypes.createStructField("id", DataTypes.IntegerType, false),
          DataTypes.createStructField("name", DataTypes.StringType, false),
          DataTypes.createStructField("url", DataTypes.StringType, true),
          DataTypes.createStructField("pictures", DataTypes.StringType, true),
          DataTypes.createStructField("time", DataTypes.TimestampType, true) });

  JavaRDD<Row> rowData = data.map(new Function<String, String[]>() {
    @Override
    public String[] call(String line) throws Exception {
      return line.split("\t");
    }
  }).map(new Function<String[], Row>() {
    @Override
    public Row call(String[] r) throws Exception {
      return RowFactory.create(Integer.parseInt(r[0]), r[1], r[2], r[3],
          new Timestamp(DatatypeConverter.parseDateTime(r[4]).getTimeInMillis()));
    }
  });

  return sqc.createDataFrame(rowData, schema);
}
 
Example #26
Source File: SparkTable.java    From iceberg with Apache License 2.0
public SparkTable(Table icebergTable, StructType requestedSchema) {
  this.icebergTable = icebergTable;
  this.requestedSchema = requestedSchema;

  if (requestedSchema != null) {
    // convert the requested schema to throw an exception if any requested fields are unknown
    SparkSchemaUtil.convert(icebergTable.schema(), requestedSchema);
  }
}
 
Example #27
Source File: JavaDCTExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaDCTExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
    RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
    RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("features", new VectorUDT(), false, Metadata.empty()),
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  DCT dct = new DCT()
    .setInputCol("features")
    .setOutputCol("featuresDCT")
    .setInverse(false);

  Dataset<Row> dctDf = dct.transform(df);

  dctDf.select("featuresDCT").show(false);
  // $example off$

  spark.stop();
}
 
Example #28
Source File: KafkaInput.java    From envelope with Apache License 2.0
@Override
public void receiveExpectedSchema(StructType expectedSchema) {
  this.expectedSchema = expectedSchema;

  List<String> fieldNames = Lists.newArrayList(KEY_FIELD_NAME, Translator.VALUE_FIELD_NAME);

  for (String fieldName : fieldNames) {
    if (Lists.newArrayList(expectedSchema.fieldNames()).contains(fieldName)) {
      DataType fieldDataType = expectedSchema.fields()[expectedSchema.fieldIndex(fieldName)].dataType();

      if (fieldDataType.equals(DataTypes.StringType)) {
        kafkaParams.put(fieldName + ".deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
      }
      else if (fieldDataType.equals(DataTypes.BinaryType)) {
        kafkaParams.put(fieldName + ".deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
      }
      else {
        throw new RuntimeException("Translator expects '" + fieldName + "' field to be of type '" + fieldDataType +
            "' but Kafka input only supports providing '" + fieldName + "' field as either string or   binary.");
      }
    }
    else {
      // If the translator doesn't expect the field then provide it as binary
      kafkaParams.put(fieldName + ".deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    }
  }
}
 
Example #29
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public SparkTable getTable(StructType schema, Transform[] partitioning, Map<String, String> options) {
  // TODO: if partitioning is non-null, the table is being created?
  // Get Iceberg table from options
  Configuration conf = new Configuration(SparkSession.active().sparkContext().hadoopConfiguration());
  Table icebergTable = getTableAndResolveHadoopConfiguration(options, conf);

  // Build Spark table based on Iceberg table, and return it
  return new SparkTable(icebergTable, schema);
}