org.apache.spark.sql.DataFrameWriter Java Examples

The following examples show how to use org.apache.spark.sql.DataFrameWriter. Each example is drawn from an open source project; the originating project and source file are noted above each snippet.
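Before the project examples, here is a minimal, self-contained sketch of the typical DataFrameWriter call chain. The app name, input path, output path, and compression option are illustrative placeholders rather than values from any of the projects below:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class DataFrameWriterSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("writer-sketch").getOrCreate();
    Dataset<Row> df = spark.read().json("/tmp/input.json"); // placeholder input

    df.write()                            // Dataset.write() returns a DataFrameWriter<Row>
        .format("parquet")                // select the output data source
        .mode(SaveMode.Overwrite)         // behavior when the target already exists
        .option("compression", "snappy")  // options are passed through to the data source
        .save("/tmp/output");             // triggers the actual write job

    spark.stop();
  }
}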
Example #1
Source File: HoodieJavaApp.java    From hudi with Apache License 2.0
/**
 * Sets up configs for syncing to Hive.
 */
private DataFrameWriter<Row> updateHiveSyncConfig(DataFrameWriter<Row> writer) {
  if (enableHiveSync) {
    LOG.info("Enabling Hive sync to " + hiveJdbcUrl);
    writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable)
        .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB)
        .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl)
        .option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser)
        .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
        .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");
    if (nonPartitionedTable) {
      writer = writer
          .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
              NonPartitionedExtractor.class.getCanonicalName())
          .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "");
    } else if (useMultiPartitionKeys) {
      writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day").option(
          DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
          MultiPartKeysValueExtractor.class.getCanonicalName());
    } else {
      writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr");
    }
  }
  return writer;
}
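Each option(...) call returns the writer itself, so DataFrameWriter is configured as a fluent builder; the reassignments above simply thread the writer through the three partitioning branches. The OPT_KEY() accessors are constants defined in Hudi's Scala DataSourceWriteOptions object, which surface as methods when called from Java.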
 
Example #2
Source File: TestDataFrameWrites.java    From iceberg with Apache License 2.0
private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) throws IOException {
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  List<Record> expected = RandomData.generateList(tableSchema, 100, 0L);
  Dataset<Row> df = createDataset(expected, tableSchema);
  DataFrameWriter<?> writer = df.write().format("iceberg").mode("append");

  writer.save(location.toString());

  table.refresh(); // reload table metadata so the snapshot created by the write is visible

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> actual = result.collectAsList();

  Assert.assertEquals("Result size should match expected", expected.size(), actual.size());
  for (int i = 0; i < expected.size(); i += 1) {
    assertEqualsSafe(tableSchema.asStruct(), expected.get(i), actual.get(i));
  }

  table.currentSnapshot().addedFiles().forEach(dataFile ->
      Assert.assertTrue(
          String.format(
              "File should have the parent directory %s, but has: %s.",
              expectedDataDir.getAbsolutePath(),
              dataFile.path()),
          URI.create(dataFile.path().toString()).getPath().startsWith(expectedDataDir.getAbsolutePath())));
}
 
Example #3
Source File: TestHoodieSnapshotExporter.java    From hudi with Apache License 2.0
@Override
public DataFrameWriter<Row> partition(Dataset<Row> source) {
  return source
      .withColumnRenamed(HoodieRecord.PARTITION_PATH_METADATA_FIELD, PARTITION_NAME)
      .repartition(new Column(PARTITION_NAME))
      .write()
      .partitionBy(PARTITION_NAME);
}
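The returned writer carries a partition column but is not yet bound to a format or output path; the caller completes the chain. A hypothetical completion might look like this (the format, mode, and path are illustrative, not taken from the Hudi exporter):

DataFrameWriter<Row> writer = partition(sourceDataset);  // sourceDataset is a placeholder Dataset<Row>
writer.format("parquet")
    .mode(SaveMode.Append)
    .save("/tmp/snapshot-export");                       // illustrative output path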
 
Example #4
Source File: HiveOutput.java    From envelope with Apache License 2.0
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {    
  for (Tuple2<MutationType, Dataset<Row>> plan : planned) {
    MutationType mutationType = plan._1();
    Dataset<Row> mutation = (doesAlignColumns) ? alignColumns(plan._2()) : plan._2();
    DataFrameWriter<Row> writer = mutation.write();

    if (partitionColumns != null) {
      writer = writer.partitionBy(partitionColumns);
    }

    if (options != null) {
      writer = writer.options(options);
    }

    switch (mutationType) {
      case INSERT:
        writer = writer.mode(SaveMode.Append);
        break;
      case OVERWRITE:
        writer = writer.mode(SaveMode.Overwrite);
        break;
      default:
        throw new RuntimeException("Hive output does not support mutation type: " + mutationType);
    }

    writer.insertInto(tableName);
  }
}
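Unlike the save(location) calls in the Iceberg examples, insertInto(tableName) writes into an existing Hive table and takes its location and format from the metastore definition, which is why neither a path nor a format is set on the writer here. Be aware that some Spark versions reject combining partitionBy with insertInto, so the partitionBy branch assumes a version that permits it.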
 
Example #5
Source File: TestDataFrameWrites.java    From iceberg with Apache License 2.0
private void writeData(Iterable<Record> records, Schema schema, String location) throws IOException {
  Dataset<Row> df = createDataset(records, schema);
  DataFrameWriter<?> writer = df.write().format("iceberg").mode("append");
  writer.save(location);
}
 
Example #6
Source File: FileSystemOutput.java    From envelope with Apache License 2.0
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {
  for (Tuple2<MutationType, Dataset<Row>> plan : planned) {
    MutationType mutationType = plan._1();
    Dataset<Row> mutation = plan._2();

    DataFrameWriter<Row> writer = mutation.write();

    if (columns != null) {
      LOG.debug("Partitioning output");

      writer = writer.partitionBy(columns.toArray(new String[columns.size()]));
    }

    switch (mutationType) {
      case INSERT:
        writer = writer.mode(SaveMode.Append);
        break;
      case OVERWRITE:
        writer = writer.mode(SaveMode.Overwrite);
        break;
      default:
        throw new RuntimeException("Filesystem output does not support mutation type: " + mutationType);
    }

    switch (format) {
      case PARQUET_FORMAT:
        LOG.debug("Writing Parquet: {}", path);
        writer.parquet(path);
        break;
      case CSV_FORMAT:
        LOG.debug("Writing CSV: {}", path);
        writer.options(options).csv(path);
        break;
      case JSON_FORMAT:
        LOG.debug("Writing JSON: {}", path);
        writer.json(path);
        break;
      default:
        throw new RuntimeException("Filesystem output does not support file format: " + format);
    }
  }
}
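Note that only the CSV branch applies the configured options map to the writer; the Parquet and JSON branches write with the writer's existing settings.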
 
Example #7
Source File: HoodieSnapshotExporter.java    From hudi with Apache License 2.0
DataFrameWriter<Row> partition(Dataset<Row> source);
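This is the exporter's abstract partitioner hook; Example #3 above shows a concrete implementation that renames the Hudi partition-path metadata field, repartitions on it, and returns a writer partitioned by the same column.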