org.apache.spark.sql.sources.v2.writer.DataSourceWriter Java Examples

The following examples show how to use org.apache.spark.sql.sources.v2.writer.DataSourceWriter. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: IcebergSource.java From iceberg with Apache License 2.0

6 votes

@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
                                               DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}

Example #2

Source File: IcebergSource.java From iceberg with Apache License 2.0

5 votes

@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dfStruct, SaveMode mode,
                                                 DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append, "Save mode %s is not supported", mode);

  Table table = findTable(options);

  Schema dfSchema = SparkSchemaUtil.convert(table.schema(), dfStruct);
  List<String> errors = CheckCompatibility.writeCompatibilityErrors(table.schema(), dfSchema);
  if (!errors.isEmpty()) {
    StringBuilder sb = new StringBuilder();
    sb.append("Cannot write incompatible dataframe to table with schema:\n")
        .append(table.schema()).append("\nProblems:");
    for (String error : errors) {
      sb.append("\n* ").append(error);
    }
    throw new IllegalArgumentException(sb.toString());
  }

  Optional<String> formatOption = options.get("iceberg.write.format");
  FileFormat format;
  if (formatOption.isPresent()) {
    format = FileFormat.valueOf(formatOption.get().toUpperCase(Locale.ENGLISH));
  } else {
    format = FileFormat.valueOf(table.properties()
        .getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT)
        .toUpperCase(Locale.ENGLISH));
  }

  return Optional.of(new Writer(table, lazyConf(), format));
}

Example #3

Source File: HiveWarehouseConnector.java From spark-llap with Apache License 2.0

5 votes

@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType schema,
    SaveMode mode, DataSourceOptions options) {
  Map<String, String> params = getOptions(options);
  String stagingDirPrefix = HWConf.LOAD_STAGING_DIR.getFromOptionsMap(params);
  Path path = new Path(stagingDirPrefix);
  Configuration conf = SparkSession.getActiveSession().get().sparkContext().hadoopConfiguration();
  return Optional.of(getDataSourceWriter(jobId, schema, path, params, conf));
}

Example #4

Source File: ParallelRowReadWriteDataSource.java From spark-data-sources with MIT License

4 votes

/**
 * Spark calls this to create the writer. The data source options are used
 * in the same way as above.
 * @param jobId
 * @param schema
 * @param mode
 * @param options
 * @return
 */
@Override
public Optional<DataSourceWriter> createWriter(
        String jobId, StructType schema, SaveMode mode, DataSourceOptions options)
{
    // TODO: ned to distinguish between creating the table for the first time
    // TODO: (just validate schema and create) vs appending (compare schema)

    // TODO: log JobId here and elsewhere whent he partitionId etc are logged

    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));

    edb.common.Schema dbSchema = DBClientWrapper.sparkToDbSchema(schema);

    boolean truncateOnCommit = false;

    DBClientWrapper db = new DBClientWrapper(host, port);
    db.connect();
    if (db.tableExists(table)) {
        switch (mode) {
            case ErrorIfExists: {
                // check existence and throw if needed
                throw new RuntimeException("data already exists");
            }
            case Append: {
                // just check schema compatibility
                try {
                    Schema actualSchema = db.getDBSchema(table);
                    if (!dbSchema.isCompatible(actualSchema)) {
                        throw new RuntimeException("Appending to table with incompatible schema");
                    }
                } catch (UnknownTableException ute) {
                    throw new RuntimeException(ute);
                }
                break;
            }
            case Overwrite: {
                // two options if table exists: truncate it now or truncate it later
                truncateOnCommit = true;
                break;
            }
            case Ignore: {
                // check existence and declare victory
                return Optional.empty();
            }
            default:
        }
    } else {
        db.createTable(table, dbSchema);
    }

    return Optional.of(new Writer(host, port, table, partitions, dbSchema, truncateOnCommit));
}

Example #5

Source File: HiveStreamingDataSource.java From spark-llap with Apache License 2.0

4 votes

@Override
public Optional<DataSourceWriter> createWriter(final String jobId, final StructType schema, final SaveMode mode,
  final DataSourceOptions options) {
  return Optional.of(createDataSourceWriter(jobId, schema, options));
}

Example #6

Source File: HiveWarehouseConnector.java From spark-llap with Apache License 2.0

4 votes

protected DataSourceWriter getDataSourceWriter(String jobId, StructType schema,
    Path path, Map<String, String> options, Configuration conf) {
  return new HiveWarehouseDataSourceWriter(options, jobId, schema, path, conf);
}

Example #7

Source File: MockHiveWarehouseConnector.java From spark-llap with Apache License 2.0

4 votes

@Override
protected DataSourceWriter getDataSourceWriter(String jobId, StructType schema,
    Path path, Map<String, String> options, Configuration conf) {
  return new MockWriteSupport.MockHiveWarehouseDataSourceWriter(options, jobId, schema, path, conf);
}