Java Code Examples for org.apache.spark.sql.DataFrameWriter

The following examples show how to use org.apache.spark.sql.DataFrameWriter. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: hudi   Source File: HoodieJavaApp.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Adds the Hive sync options to the given writer when Hive sync is enabled.
 *
 * <p>When {@code enableHiveSync} is not set, the writer is handed back without any extra
 * options. Otherwise the Hive table, database, JDBC URL, user and password options are set and
 * sync is switched on, followed by the partition-related options selected by
 * {@code nonPartitionedTable} and {@code useMultiPartitionKeys}.
 *
 * @param writer the writer to configure
 * @return the writer carrying the Hive sync options, or the given writer if sync is disabled
 */
private DataFrameWriter<Row> updateHiveSyncConfig(DataFrameWriter<Row> writer) {
  if (!enableHiveSync) {
    return writer;
  }

  LOG.info("Enabling Hive sync to " + hiveJdbcUrl);

  // Connection-level options shared by every table layout.
  DataFrameWriter<Row> synced = writer
      .option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable)
      .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB)
      .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl)
      .option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser)
      .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
      .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");

  if (nonPartitionedTable) {
    // No partitions: use the non-partitioned extractor and an empty partition path field.
    return synced
        .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
            NonPartitionedExtractor.class.getCanonicalName())
        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "");
  }

  if (useMultiPartitionKeys) {
    return synced
        .option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day")
        .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
            MultiPartKeysValueExtractor.class.getCanonicalName());
  }

  // Default layout: a single partition field.
  return synced.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr");
}
 
Example 2
Source Project: iceberg   Source File: TestDataFrameWrites.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes generated records to the table location through the "iceberg" format in append mode,
 * reads them back, and checks both the rows and the location of the added data files.
 *
 * @param table the table whose schema and snapshot are used
 * @param location the location passed to the Spark writer and reader
 * @param expectedDataDir the directory every added data file path must start with
 * @throws IOException declared by the signature; propagated from the helpers called here
 */
private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) throws IOException {
  // use the table schema because ids are reassigned
  Schema tableSchema = table.schema();

  table.updateProperties()
      .set(TableProperties.DEFAULT_FILE_FORMAT, format)
      .commit();

  List<Record> expected = RandomData.generateList(tableSchema, 100, 0L);
  String tableLocation = location.toString();

  createDataset(expected, tableSchema)
      .write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  table.refresh();

  List<Row> actual = spark.read().format("iceberg").load(tableLocation).collectAsList();

  Assert.assertEquals("Result size should match expected", expected.size(), actual.size());
  int index = 0;
  for (Record expectedRecord : expected) {
    assertEqualsSafe(tableSchema.asStruct(), expectedRecord, actual.get(index));
    index++;
  }

  // Every file added by the current snapshot must live under the expected data directory.
  table.currentSnapshot().addedFiles().forEach(dataFile -> {
    String failureMessage = String.format(
        "File should have the parent directory %s, but has: %s.",
        expectedDataDir.getAbsolutePath(),
        dataFile.path());
    String filePath = URI.create(dataFile.path().toString()).getPath();
    Assert.assertTrue(failureMessage, filePath.startsWith(expectedDataDir.getAbsolutePath()));
  });
}
 
Example 3
Source Project: hudi   Source File: TestHoodieSnapshotExporter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Renames the partition path metadata column to {@code PARTITION_NAME}, repartitions the
 * dataset by that column, and returns a writer partitioned by it.
 *
 * @param source the dataset to partition
 * @return a writer over the repartitioned dataset, partitioned by {@code PARTITION_NAME}
 */
@Override
public DataFrameWriter<Row> partition(Dataset<Row> source) {
  Dataset<Row> renamed =
      source.withColumnRenamed(HoodieRecord.PARTITION_PATH_METADATA_FIELD, PARTITION_NAME);
  Dataset<Row> redistributed = renamed.repartition(new Column(PARTITION_NAME));
  return redistributed.write().partitionBy(PARTITION_NAME);
}
 
Example 4
Source Project: envelope   Source File: HiveOutput.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Inserts each planned mutation into {@code tableName}.
 *
 * <p>For every plan the dataset is optionally passed through {@code alignColumns}, the writer
 * is given the partition columns and options when they are set, and the save mode is chosen
 * from the mutation type: {@code INSERT} appends and {@code OVERWRITE} overwrites.
 *
 * @param planned pairs of mutation type and the dataset to apply it with
 * @throws RuntimeException if a plan carries any other mutation type
 */
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {
  for (Tuple2<MutationType, Dataset<Row>> entry : planned) {
    MutationType type = entry._1();

    Dataset<Row> rows = entry._2();
    if (doesAlignColumns) {
      rows = alignColumns(rows);
    }

    DataFrameWriter<Row> writer = rows.write();
    if (partitionColumns != null) {
      writer = writer.partitionBy(partitionColumns);
    }
    if (options != null) {
      writer = writer.options(options);
    }

    SaveMode saveMode;
    switch (type) {
      case INSERT:
        saveMode = SaveMode.Append;
        break;
      case OVERWRITE:
        saveMode = SaveMode.Overwrite;
        break;
      default:
        throw new RuntimeException("Hive output does not support mutation type: " + type);
    }

    writer.mode(saveMode).insertInto(tableName);
  }
}
 
Example 5
Source Project: iceberg   Source File: TestDataFrameWrites.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds a dataset from the records and saves it to the location using the "iceberg" format
 * in append mode.
 *
 * @param records the records to write
 * @param schema the schema passed to {@code createDataset}
 * @param location the location handed to the writer's {@code save}
 * @throws IOException declared by the signature; propagated from the helpers called here
 */
private void writeData(Iterable<Record> records, Schema schema, String location) throws IOException {
  createDataset(records, schema)
      .write()
      .format("iceberg")
      .mode("append")
      .save(location);
}
 
Example 6
Source Project: envelope   Source File: FileSystemOutput.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Writes each planned mutation to {@code path} in the configured file format.
 *
 * <p>The writer is partitioned by {@code columns} when they are set. {@code INSERT} appends and
 * {@code OVERWRITE} overwrites. The output is then written as Parquet, CSV or JSON depending on
 * {@code format}; only the CSV branch applies {@code options}.
 *
 * @param planned pairs of mutation type and the dataset to write
 * @throws RuntimeException if the mutation type or the file format is not supported
 */
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {
  for (Tuple2<MutationType, Dataset<Row>> entry : planned) {
    MutationType type = entry._1();
    DataFrameWriter<Row> writer = entry._2().write();

    if (columns != null) {
      LOG.debug("Partitioning output");
      String[] partitionColumns = columns.toArray(new String[0]);
      writer = writer.partitionBy(partitionColumns);
    }

    SaveMode saveMode;
    switch (type) {
      case INSERT:
        saveMode = SaveMode.Append;
        break;
      case OVERWRITE:
        saveMode = SaveMode.Overwrite;
        break;
      default:
        throw new RuntimeException("Filesystem output does not support mutation type: " + type);
    }
    writer = writer.mode(saveMode);

    switch (format) {
      case PARQUET_FORMAT:
        LOG.debug("Writing Parquet: {}", path);
        writer.parquet(path);
        break;
      case CSV_FORMAT:
        LOG.debug("Writing CSV: {}", path);
        writer.options(options).csv(path);
        break;
      case JSON_FORMAT:
        LOG.debug("Writing JSON: {}", path);
        writer.json(path);
        break;
      default:
        throw new RuntimeException("Filesystem output does not support file format: " + format);
    }
  }
}
 
Example 7
Source Project: hudi   Source File: HoodieSnapshotExporter.java    License: Apache License 2.0 votes vote down vote up
/**
 * Produces a {@link DataFrameWriter} for the given dataset.
 *
 * <p>NOTE(review): the name suggests implementations decide how the output is partitioned
 * before it is written; confirm against the implementing classes.
 *
 * @param source the dataset to be written
 * @return a writer for the rows of {@code source}
 */
DataFrameWriter<Row> partition(Dataset<Row> source);