Java Code Examples for org.apache.spark.sql.SaveMode

The following examples show how to use org.apache.spark.sql.SaveMode, the enum that tells a DataFrameWriter what to do when data already exists at the target: Append adds the new rows, Overwrite replaces the existing data, ErrorIfExists (the default) throws an exception, and Ignore silently skips the write. These examples are extracted from open source projects; the Source Project and Source File references above each example point to the original code.
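Before the project examples, here is a minimal, self-contained sketch of all four modes in use. The class name, paths, and session settings are placeholders chosen for illustration only and are not taken from any of the projects below.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class SaveModeSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local")
                .appName("SaveMode sketch")
                .getOrCreate();

        // Placeholder input; any Dataset<Row> behaves the same way.
        Dataset<Row> df = spark.read().json("/tmp/input.json");

        // Append: add rows to whatever already exists at the target path.
        df.write().mode(SaveMode.Append).parquet("/tmp/out/append");

        // Overwrite: replace any data already at the target path.
        df.write().mode(SaveMode.Overwrite).parquet("/tmp/out/overwrite");

        // ErrorIfExists (the default): throw if the target already holds data.
        df.write().mode(SaveMode.ErrorIfExists).parquet("/tmp/out/error");

        // Ignore: leave existing data untouched and skip the write silently.
        df.write().mode(SaveMode.Ignore).parquet("/tmp/out/ignore");

        spark.stop();
    }
}
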
Example 1
Source Project: sparkResearch   Source File: DataSources.java    License: Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    Dataset<Row> rddDataset = sparkSession.read().parquet("usr/local/data.parquet");
    rddDataset.select("name","age").write().save("nameAndAge.parquet");

    Dataset<Row> jsonDataSet = sparkSession.read().json("usr/local/data.json");
    jsonDataSet.select("name","age").write().save("nameAndAge.json");

    // manually specify the data source format

    Dataset<Row> customDataSource = sparkSession.read().format("json").load("usr/local/data.json");
    customDataSource.select("name","age").write().format("json").mode(SaveMode.Append).save("nameAndAge.json");
}
 
Example 2
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
                                               DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}
 
Example 3
Source Project: iceberg   Source File: TestIcebergSourceTablesBase.java    License: Apache License 2.0
@Test
public synchronized void testTablesSupport() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "table");
  createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "1"),
      new SimpleRecord(2, "2"),
      new SimpleRecord(3, "3"));

  Dataset<Row> inputDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode(SaveMode.Append)
      .save(loadLocation(tableIdentifier));

  Dataset<Row> resultDf = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier));
  List<SimpleRecord> actualRecords = resultDf.orderBy("id")
      .as(Encoders.bean(SimpleRecord.class))
      .collectAsList();

  Assert.assertEquals("Records should match", expectedRecords, actualRecords);
}
 
Example 4
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
	
    // remove spaces from column names as parquet does not support them
    for(String columnName : dataset.columns()) {
        if(columnName.contains(" ")) {
            String newColumnName = columnName.replace(' ', '_');
            dataset = dataset.withColumnRenamed(columnName, newColumnName);
        }
    }

    dataset.cache();
    BpmnaiUtils.getInstance().writeDatasetToParquet(dataset, "result", config);

    if(config.isGenerateResultPreview()) {
        dataset.limit(config.getResultPreviewLineCount()).write().mode(SaveMode.Overwrite).saveAsTable(BpmnaiVariables.RESULT_PREVIEW_TEMP_TABLE);
    }

    return dataset;
}
 
Example 5
Source Project: hudi   Source File: HoodieSnapshotExporter.java    License: Apache License 2.0
private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
  Partitioner defaultPartitioner = dataset -> {
    Dataset<Row> hoodieDroppedDataset = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
    return StringUtils.isNullOrEmpty(cfg.outputPartitionField)
        ? hoodieDroppedDataset.write()
        : hoodieDroppedDataset.repartition(new Column(cfg.outputPartitionField)).write().partitionBy(cfg.outputPartitionField);
  };

  Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner)
      ? defaultPartitioner
      : ReflectionUtils.loadClass(cfg.outputPartitioner);

  final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
  Iterator<String> exportingFilePaths = jsc
      .parallelize(partitions, partitions.size())
      .flatMap(partition -> fsView
          .getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
          .map(HoodieBaseFile::getPath).iterator())
      .toLocalIterator();

  Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
  partitioner.partition(sourceDataset)
      .format(cfg.outputFormat)
      .mode(SaveMode.Overwrite)
      .save(cfg.targetOutputPath);
}
 
Example 6
Source Project: stocator   Source File: TestSuite.java    License: Apache License 2.0
public void test16(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
    throws Exception {
  System.out.println("*********************************");
  System.out.println("T16: Non overwrite mode " + containerOut);
  String o1 = containerOut + "myData/123";
  StructType schema = DataTypes
      .createStructType(new StructField[] { DataTypes.createStructField("NAME", DataTypes.StringType, false),
          DataTypes.createStructField("STRING_VALUE", DataTypes.StringType, false),
          DataTypes.createStructField("NUM_VALUE", DataTypes.IntegerType, false), });
  Row r1 = RowFactory.create("name1", "value1", 1);
  Row r2 = RowFactory.create("name2", "value2", 2);
  List<Row> rowList = ImmutableList.of(r1, r2);
  Dataset<Row> rows = spark.createDataFrame(rowList, schema);
  try {
    if (type.equals(Constants.PARQUET_TYPE)) {
      rows.write().mode(SaveMode.Overwrite).parquet(o1);
    } else if (type.equals(Constants.JSON_TYPE)) {
      rows.write().mode(SaveMode.Overwrite).json(o1);
    }
  } catch (Exception e) {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
    throw e;
  } finally {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
  }
}
 
Example 7
Source Project: iceberg   Source File: IcebergSourceBenchmark.java    License: Apache License 2.0
protected void appendAsFile(Dataset<Row> ds) {
  // ensure the schema is precise (including nullability)
  StructType sparkSchema = SparkSchemaUtil.convert(table.schema());
  spark.createDataFrame(ds.rdd(), sparkSchema)
      .coalesce(1)
      .write()
      .format("iceberg")
      .mode(SaveMode.Append)
      .save(table.location());
}
 
Example 8
@Benchmark
@Threads(1)
public void writeFileSource() {
  Map<String, String> conf = Maps.newHashMap();
  conf.put(SQLConf.PARQUET_COMPRESSION().key(), "gzip");
  withSQLConf(conf, () -> benchmarkData().write().mode(SaveMode.Append).parquet(dataLocation()));
}
 
Example 9
@Benchmark
@Threads(1)
public void writeFileSource() {
  Map<String, String> conf = Maps.newHashMap();
  conf.put(SQLConf.PARQUET_COMPRESSION().key(), "gzip");
  withSQLConf(conf, () -> benchmarkData().write().mode(SaveMode.Append).parquet(dataLocation()));
}
 
Example 10
Source Project: iceberg   Source File: TestDataFrameWrites.java    License: Apache License 2.0
@Test
public void testNullableWithWriteOption() throws IOException {
  Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2"));

  File location = new File(temp.newFolder("parquet"), "test");
  String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString());
  String targetPath = String.format("%s/nullable_poc/targetFolder/", location.toString());

  tableProperties = ImmutableMap.of(TableProperties.WRITE_NEW_DATA_LOCATION, targetPath);

  // read this and append to iceberg dataset
  spark
      .read().schema(sparkSchema).json(
      JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1))
      .write().parquet(sourcePath);

  // this is our iceberg dataset to which we will append data
  new HadoopTables(spark.sessionState().newHadoopConf())
      .create(
          icebergSchema,
          PartitionSpec.builderFor(icebergSchema).identity("requiredField").build(),
          tableProperties,
          targetPath);

  // this is the initial data inside the iceberg dataset
  spark
      .read().schema(sparkSchema).json(
      JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0))
      .write().format("iceberg").mode(SaveMode.Append).save(targetPath);

  // read from parquet and append to iceberg w/ nullability check disabled
  spark
      .read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath)
      .write().format("iceberg").option("check-nullability", false).mode(SaveMode.Append).save(targetPath);

  // read all data
  List<Row> rows = spark.read().format("iceberg").load(targetPath).collectAsList();
  Assert.assertEquals("Should contain 6 rows", 6, rows.size());
}
 
Example 11
Source Project: hui-bigdata-spark   Source File: MetroAnalysisJob.java    License: Apache License 2.0
/**
     * Core data-processing logic.
     * @param sparkContext the Spark context to use
     * @param inPutPath    path of the input JSON data
     * @param outPutPath   path to write the JSON results to
     */
    private void deal(JavaSparkContext sparkContext, String inPutPath, String outPutPath) {
        SparkJobUtil.checkFileExists(inPutPath);

        SQLContext sqlContext = new SQLContext(sparkContext);
//        sqlContext.setConf("spark.sql.parquet.binaryAsString","true");

        // create a temporary snapshot table
        Dataset<Row> dataset = sqlContext.read().json(inPutPath);
        dataset.registerTempTable("hui_metro_testjson");
        dataset.show(10);

        Dataset<Row> resultFrame = sqlContext.sql(SQL);

        if (resultFrame.count() > 0) {
            resultFrame.repartition(3).write()
                    .mode(SaveMode.Append).json(outPutPath);
        }

        resultFrame.show(10);

        // write the results to the database
        MySQLJdbcConfig jdbcConfig = new MySQLJdbcConfig();
        jdbcConfig.init();
        resultFrame.write().mode("append")
                .jdbc(jdbcConfig.getUrl(), "hui_metro_test", jdbcConfig.getConnectionProperties());
    }
 
Example 12
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

    /*
    TODO: Not working yet
	// if output format is set to "csv" create both: csv and parquet 
	if(SparkImporterKafkaImportArguments.getInstance().getOutputFormat().equals(SparkImporterVariables.OUTPUT_FORMAT_CSV)) {
		dataset
        .write()
        .option("header", "true")
        .option("delimiter", ";")
        .option("ignoreLeadingWhiteSpace", "false")
        .option("ignoreTrailingWhiteSpace", "false")
        .mode(SparkImporterVariables.getSaveMode())
        .csv(SparkImporterVariables.getTargetFolder());
	}
	*/
  
	dataset
            //we repartition the data by process instances, which allows spark to better distribute the data between workers as the operations are related to a process instance
            .repartition(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID))
            .write()
            .mode(SaveMode.Append)
            .save(config.getTargetFolder());

    return dataset;
}
 
Example 13
public void createOrUpdateSparkRunnerConfig(SparkRunnerConfig config) {
    if(config == null) {
        config = new SparkRunnerConfig();
    }

    config.setTargetFolder(this.fileDestination);
    config.setWorkingDirectory(this.workingDirectory);
    config.setLogDirectory(this.logDirectory);
    config.setOutputFormat(this.outputFormat);
    config.setSaveMode(BpmnaiVariables.SAVE_MODE_APPEND.equals(this.saveMode) ? SaveMode.Append : SaveMode.Overwrite); // compare the configured string with equals(), not ==
    config.setDataLevel(this.dataLevel);
    config.setWriteStepResultsIntoFile(this.writeStepResultsToCSV);
}
 
Example 14
Source Project: SparkDemo   Source File: SaveModelDemo.java    License: MIT License
public static void main(String[] args) {
	SparkConf conf = new SparkConf().setAppName("SaveModelDemo").setMaster("local");
	JavaSparkContext sc = new JavaSparkContext(conf);
	// create a DataFrame by reading JSON
	SQLContext sqlContext = new SQLContext(sc);

	Dataset<Row> dataset = sqlContext.read().format("json").load(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

	dataset.write().mode(SaveMode.ErrorIfExists).save("tmp/people2.json"); // ErrorIfExists: fail if the target already exists
	dataset.write().mode(SaveMode.Append).save("tmp/people2.json"); // Append: add to the existing data
	dataset.write().mode(SaveMode.Ignore).save("tmp/people2.json"); // Ignore: skip the write if data already exists
	dataset.write().mode(SaveMode.Overwrite).save("tmp/people2.json"); // Overwrite: replace the existing data

	sc.close();
}
 
Example 15
Source Project: iceberg   Source File: IcebergSource.java    License: Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dfStruct, SaveMode mode,
                                                 DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append, "Save mode %s is not supported", mode);

  Table table = findTable(options);

  Schema dfSchema = SparkSchemaUtil.convert(table.schema(), dfStruct);
  List<String> errors = CheckCompatibility.writeCompatibilityErrors(table.schema(), dfSchema);
  if (!errors.isEmpty()) {
    StringBuilder sb = new StringBuilder();
    sb.append("Cannot write incompatible dataframe to table with schema:\n")
        .append(table.schema()).append("\nProblems:");
    for (String error : errors) {
      sb.append("\n* ").append(error);
    }
    throw new IllegalArgumentException(sb.toString());
  }

  Optional<String> formatOption = options.get("iceberg.write.format");
  FileFormat format;
  if (formatOption.isPresent()) {
    format = FileFormat.valueOf(formatOption.get().toUpperCase(Locale.ENGLISH));
  } else {
    format = FileFormat.valueOf(table.properties()
        .getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT)
        .toUpperCase(Locale.ENGLISH));
  }

  return Optional.of(new Writer(table, lazyConf(), format));
}
 
Example 16
Source Project: bunsen   Source File: AbstractValueSets.java    License: Apache License 2.0
/**
 * Writes value records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param values a dataset of value records
 * @param tableName the table to write them to
 */
private static void writeValuesToTable(Dataset<Value> values, String tableName) {

  // Note the last two columns here must be the partitioned-by columns in order and in lower case
  // for Spark to properly match them to the partitions
  Dataset<Row> orderColumnDataset = values.select("system",
      "version",
      "value",
      "valueseturi",
      "valuesetversion");

  orderColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
 
Example 17
Source Project: bunsen   Source File: Hierarchies.java    License: Apache License 2.0
/**
 * Writes ancestor records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param ancestors a dataset of ancestor records
 * @param tableName the table to write them to
 */
private static void writeAncestorsToTable(Dataset<Ancestor> ancestors, String tableName) {

  Dataset<Row> orderedColumnDataset = ancestors.select("descendantSystem",
      "descendantValue",
      "ancestorSystem",
      "ancestorValue",
      "uri",
      "version");

  orderedColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
 
Example 18
Source Project: envelope   Source File: HiveOutput.java    License: Apache License 2.0
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {    
  for (Tuple2<MutationType, Dataset<Row>> plan : planned) {
    MutationType mutationType = plan._1();
    Dataset<Row> mutation = (doesAlignColumns) ? alignColumns(plan._2()) : plan._2();
    DataFrameWriter<Row> writer = mutation.write();

    if (partitionColumns != null) {
      writer = writer.partitionBy(partitionColumns);
    }

    if (options != null) {
      writer = writer.options(options);
    }

    switch (mutationType) {
      case INSERT:
        writer = writer.mode(SaveMode.Append);
        break;
      case OVERWRITE:
        writer = writer.mode(SaveMode.Overwrite);
        break;
      default:
        throw new RuntimeException("Hive output does not support mutation type: " + mutationType);
    }

    writer.insertInto(tableName);
  }
}
 
Example 19
Source Project: spark-llap   Source File: HiveWarehouseConnector.java    License: Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType schema,
    SaveMode mode, DataSourceOptions options) {
  Map<String, String> params = getOptions(options);
  String stagingDirPrefix = HWConf.LOAD_STAGING_DIR.getFromOptionsMap(params);
  Path path = new Path(stagingDirPrefix);
  Configuration conf = SparkSession.getActiveSession().get().sparkContext().hadoopConfiguration();
  return Optional.of(getDataSourceWriter(jobId, schema, path, params, conf));
}
 
Example 20
Source Project: ignite   Source File: JavaIgniteDataFrameWriteExample.java    License: Apache License 2.0
/** */
private static void editDataAndSaveToNewTable(Ignite ignite, SparkSession spark) {
    //Load content of Ignite table to data frame.
    Dataset<Row> personDataFrame = spark.read()
            .format(IgniteDataFrameSettings.FORMAT_IGNITE())
            .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)
            .option(IgniteDataFrameSettings.OPTION_TABLE(), "person")
            .load();

    System.out.println("Data frame content:");

    //Printing content of data frame to console.
    personDataFrame.show();

    System.out.println("Modifying Data Frame and write it to Ignite:");

    personDataFrame
            .withColumn("id", col("id").plus(42)) //Edit id column
            .withColumn("name", reverse(col("name"))) //Edit name column
            .write().format(IgniteDataFrameSettings.FORMAT_IGNITE())
            .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)
            .option(IgniteDataFrameSettings.OPTION_TABLE(), "new_persons")
            .option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PRIMARY_KEY_FIELDS(), "id, city_id")
            .option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PARAMETERS(), "backups=1")
            .mode(SaveMode.Overwrite) //Overwriting entire table.
            .save();

    System.out.println("Done!");

    System.out.println("Reading data from Ignite table:");

    CacheConfiguration<?, ?> ccfg = new CacheConfiguration<>(CACHE_NAME);

    IgniteCache<?, ?> cache = ignite.getOrCreateCache(ccfg);

    //Reading saved data from Ignite.
    List<List<?>> data = cache.query(new SqlFieldsQuery("SELECT id, name, city_id FROM new_persons")).getAll();

    System.out.println(data);
}
 
Example 21
/**
 * Spark calls this to create the writer. The data source options are used
 * in the same way as above.
 * @param jobId
 * @param schema
 * @param mode
 * @param options
 * @return
 */
@Override
public Optional<DataSourceWriter> createWriter(
        String jobId, StructType schema, SaveMode mode, DataSourceOptions options)
{
    // TODO: need to distinguish between creating the table for the first time
    // TODO: (just validate schema and create) vs appending (compare schema)

    // TODO: log JobId here and elsewhere when the partitionId etc are logged

    String host = options.get("host").orElse("localhost");
    int port = options.getInt("port", -1);
    String table = options.get("table").orElse("unknownTable"); // TODO: throw
    int partitions = Integer.parseInt(options.get("partitions").orElse("0"));

    edb.common.Schema dbSchema = DBClientWrapper.sparkToDbSchema(schema);

    boolean truncateOnCommit = false;

    DBClientWrapper db = new DBClientWrapper(host, port);
    db.connect();
    if (db.tableExists(table)) {
        switch (mode) {
            case ErrorIfExists: {
                // check existence and throw if needed
                throw new RuntimeException("data already exists");
            }
            case Append: {
                // just check schema compatibility
                try {
                    Schema actualSchema = db.getDBSchema(table);
                    if (!dbSchema.isCompatible(actualSchema)) {
                        throw new RuntimeException("Appending to table with incompatible schema");
                    }
                } catch (UnknownTableException ute) {
                    throw new RuntimeException(ute);
                }
                break;
            }
            case Overwrite: {
                // two options if table exists: truncate it now or truncate it later
                truncateOnCommit = true;
                break;
            }
            case Ignore: {
                // check existence and declare victory
                return Optional.empty();
            }
            default:
        }
    } else {
        db.createTable(table, dbSchema);
    }

    return Optional.of(new Writer(host, port, table, partitions, dbSchema, truncateOnCommit));
}
 
Example 22
@Benchmark
@Threads(1)
public void writeIceberg() {
  String tableLocation = table().location();
  benchmarkData().write().format("iceberg").mode(SaveMode.Append).save(tableLocation);
}
 
Example 23
@Benchmark
@Threads(1)
public void writeIceberg() {
  String tableLocation = table().location();
  benchmarkData().write().format("iceberg").mode(SaveMode.Append).save(tableLocation);
}
 
Example 24
Source Project: iceberg   Source File: TestDataFrameWrites.java    License: Apache License 2.0
@Test
public void testNullableWithSparkSqlOption() throws IOException {
  Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2"));

  File location = new File(temp.newFolder("parquet"), "test");
  String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString());
  String targetPath = String.format("%s/nullable_poc/targetFolder/", location.toString());

  tableProperties = ImmutableMap.of(TableProperties.WRITE_NEW_DATA_LOCATION, targetPath);

  // read this and append to iceberg dataset
  spark
      .read().schema(sparkSchema).json(
      JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1))
      .write().parquet(sourcePath);

  SparkSession newSparkSession = SparkSession.builder()
      .master("local[2]")
      .appName("NullableTest")
      .config("spark.sql.iceberg.check-nullability", false)
      .getOrCreate();

  // this is our iceberg dataset to which we will append data
  new HadoopTables(newSparkSession.sessionState().newHadoopConf())
      .create(
          icebergSchema,
          PartitionSpec.builderFor(icebergSchema).identity("requiredField").build(),
          tableProperties,
          targetPath);

  // this is the initial data inside the iceberg dataset
  newSparkSession
      .read().schema(sparkSchema).json(
      JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0))
      .write().format("iceberg").mode(SaveMode.Append).save(targetPath);

  // read from parquet and append to iceberg
  newSparkSession
      .read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath)
      .write().format("iceberg").mode(SaveMode.Append).save(targetPath);

  // read all data
  List<Row> rows = newSparkSession.read().format("iceberg").load(targetPath).collectAsList();
  Assert.assertEquals("Should contain 6 rows", 6, rows.size());

}
 
Example 25
private void initializeWithEnvironmentVariables() {
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.WORKING_DIRECTORY)) != null) {
        setWorkingDirectory(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.WORKING_DIRECTORY)));
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.LOG_DIRECTORY)) != null) {
        setLogDirectory(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.LOG_DIRECTORY)));
        BpmnaiLogger.getInstance().setLogDirectory(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.LOG_DIRECTORY)));
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.FILE_SOURCE)) != null) {
        setSourceFolder(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.FILE_SOURCE)));
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.FILE_DESTINATION)) != null) {
        setTargetFolder(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.FILE_DESTINATION)));
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.SAVE_MODE)) != null) {
        setSaveMode(BpmnaiVariables.SAVE_MODE_APPEND.equals(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.SAVE_MODE))) ? SaveMode.Append : SaveMode.Overwrite); // compare the environment value with equals(), not ==
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.DATA_LEVEL)) != null) {
        setDataLevel(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.DATA_LEVEL)));
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.OUTPUT_FORMAT)) != null) {
        setOutputFormat(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.OUTPUT_FORMAT)));
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.WRITE_STEP_RESULTS)) != null) {
        setWriteStepResultsIntoFile(true);
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.DELIMITER)) != null) {
        setDelimiter(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.DELIMITER)));
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.OUTPUT_DELIMITER)) != null) {
        setOutputDelimiter(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.OUTPUT_DELIMITER)));
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.PROCESS_DEFINITION_FILTER)) != null) {
        setProcessFilterDefinitionId(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.PROCESS_DEFINITION_FILTER)));
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.BATCH_MODE)) != null) {
        setBatchMode(true);
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.KAFKA_BOOTSTRAP_SERVERS)) != null) {
        setKafkaBroker(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.KAFKA_BOOTSTRAP_SERVERS)));
    }

    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.JSON_PREVIEW)) != null) {
        setGenerateResultPreview(true);
    }
    if(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.JSON_PREVIEW_LINES)) != null) {
        setResultPreviewLineCount(Integer.parseInt(System.getenv(String.valueOf(ENVIRONMENT_VARIABLES.JSON_PREVIEW_LINES))));
    }
}
 
Example 26
public SaveMode getSaveMode() {
    return saveMode;
}
 
Example 27
public void setSaveMode(SaveMode saveMode) {
    this.saveMode = saveMode;
}
 
Example 28
public static void main(String[] args) throws AnalysisException {
	//Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
	 System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
	 //Build a Spark Session	
      SparkSession sparkSession = SparkSession
      .builder()
      .master("local")
	  .config("spark.sql.warehouse.dir","file:///E:/hadoop/warehouse")
      .appName("DatasetOperations")
      //.enableHiveSupport()
      .getOrCreate();
      Logger rootLogger = LogManager.getRootLogger();
	  rootLogger.setLevel(Level.WARN); 
	  //Create a RDD
	  JavaRDD<String> deptRDD = sparkSession.sparkContext()
			  .textFile("src/main/resources/dept.txt", 1)
			  .toJavaRDD();

	  //Convert the RDD to RDD<Rows>
	 JavaRDD<Row> deptRows = deptRDD.filter(str-> !str.contains("deptno")).map(new Function<String, Row>() {
		private static final long serialVersionUID = 1L;
		@Override
		public Row call(String rowString) throws Exception {
			String[] cols = rowString.split(",");
		    return RowFactory.create(cols[0].trim(), cols[1].trim(),cols[2].trim());
		}
	});
	  
	  //Create schema 		  
	  String[] schemaArr=deptRDD.first().split(",");
	  List<StructField> structFieldList = new ArrayList<>();
	  for (String fieldName : schemaArr) {
	    StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
	    structFieldList.add(structField);
	  }
	  StructType schema = DataTypes.createStructType(structFieldList);
	  
	  Dataset<Row> deptDf = sparkSession.createDataFrame(deptRows, schema);
	  deptDf.printSchema();
	  deptDf.show();
	  
	  deptDf.createOrReplaceTempView("dept");	
	  
	  Dataset<Row> result = sparkSession.sql("select loc,count(loc) from dept  where deptno > 10 group by loc" );
	  result.show();
	  
	  
	 // sparkSession.newSession().sql("SELECT * FROM dept").show();
	  
	  
        deptDf.createGlobalTempView("dept_global_view");
	  
	  sparkSession.newSession().sql("SELECT deptno,dname,loc, rank() OVER (PARTITION BY loc ORDER BY deptno ) FROM global_temp.dept_global_view").show();
	 
	//  sparkSession.newSession().sql("SELECT * FROM dept_global_view").show();
	  
	  deptDf.write().mode(SaveMode.Overwrite).json("src/main/resources/output/dept");
	  deptDf.write().mode(SaveMode.Overwrite).format("csv").save("src/main/resources/output/deptText");
	  deptDf.write().mode("overwrite").format("csv").save("src/main/resources/output/deptText");
	 
  
	  deptDf.write().mode(SaveMode.Overwrite).format("csv").saveAsTable("Department");
	  deptDf.write().mode(SaveMode.Overwrite).format("csv").option("path", "file:///E:/hadoop/bin").saveAsTable("Department");
	  
	// Read the CSV data
		 Dataset<Row> emp_ds = sparkSession.read()
				 .format("csv")
   		         .option("header", "true")
   		         .option("inferSchema", "true")
   		         .load("src/main/resources/employee.txt");    
		 
		 emp_ds.printSchema();
		 emp_ds.show();
		 
		emp_ds.select("empName" ,"empId").show();
		
		emp_ds.select(col("empName").name("Employee Name") ,col("empId").cast(DataTypes.IntegerType).name("Employee Id")).show();
		
		emp_ds.sort(col("empId").asc()).filter(col("salary").gt("2500"));
		
		emp_ds.select("job").groupBy(col("job")).count().show();
		
		//emp_ds.as("A").join(deptDf.as("B"),col("deptno"),"left").printSchema();

		emp_ds.as("A").join(deptDf.as("B"),emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"left").select("A.empId","A.empName","A.job","A.manager","A.hiredate","A.salary","A.comm","A.deptno","B.dname","B.loc").show();
		
		emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").show();			
		emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").logicalPlan();
		
		emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").explain();
		 
          sparkSession.sql("show functions").show(false);
          sparkSession.sql("DESCRIBE FUNCTION add_months").show(false);
          sparkSession.sql("DESCRIBE FUNCTION EXTENDED add_months").show(false);
          
         
}
 
Example 29
Source Project: bunsen   Source File: AbstractConceptMaps.java    License: Apache License 2.0
/**
 * Writes mappings to the given tables.
 *
 * <p>Warning: these updates are likely <em>not</em> atomic due to the lack of transactional
 * semantics in the underlying data store. Concurrent users may see previous items
 * removed before new ones are added, or see items appear separately from others. This is intended
 * for use in a user-specific sandbox or staging environment.
 *
 * @param mappingsTable name of the table containing the mapping records
 * @param conceptMapTable name of the table containing the concept map metadata
 */
public void writeToTables(String mappingsTable, String conceptMapTable) {

  boolean hasExistingMaps;

  try {

    this.spark.sql("describe table " + conceptMapTable);

    hasExistingMaps = true;

  } catch (Exception describeException) {

    // Checked exceptions when calling into Scala upset the Java compiler,
    // hence the need for this workaround and re-throw to propagate unexpected
    // failures.
    if (describeException instanceof NoSuchTableException) {

      hasExistingMaps = false;

    } else {

      throw new RuntimeException(describeException);
    }
  }

  if (!hasExistingMaps) {

    // No target tables exist, so create and write them. The mappings
    // and ancestors tables are created explicitly to meet our
    // partitioning system.
    createMappingTable(this.spark, mappingsTable, null);

    JavaSparkContext javaContext = new JavaSparkContext(spark.sparkContext());

    // Create a concept map table by writing empty data having the proper schema and properties
    this.spark.createDataFrame(javaContext.emptyRDD(), conceptMapRowConverter.getSchema())
        .withColumn("timestamp", lit(null).cast("timestamp"))
        .write()
        .format("parquet")
        .partitionBy("timestamp")
        .saveAsTable(conceptMapTable);
  }

  Dataset<UrlAndVersion> currentMembers = this.spark
      .sql("SELECT url, version FROM " + conceptMapTable)
      .distinct()
      .as(URL_AND_VERSION_ENCODER);

  if (hasDuplicateUrlAndVersions(currentMembers)) {

    throw new IllegalArgumentException("The given concept maps contains duplicates url and "
        + "versions against concept maps already stored in the table, " + conceptMapTable);
  }

  writeMappingsToTable(this.mappings, mappingsTable);

  this.conceptMaps.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(conceptMapTable);
}
 
Example 30
Source Project: bunsen   Source File: AbstractValueSets.java    License: Apache License 2.0
/**
 * Writes value sets to the given tables.
 *
 * <p>Warning: these updates are likely <em>not</em> atomic due to the lack of transactional
 * semantics in the underlying data store. Concurrent users may see previous items
 * removed before new ones are added, or see items appear separately from others. This is intended
 * for use in a user-specific sandbox or staging environment.
 *
 * @param valuesTable name of the table to which the value records are saved
 * @param valueSetTable name of the table to which the value set metadata is saved
 */
public void writeToTables(String valuesTable, String valueSetTable) {

  boolean hasExistingValueSets;

  try {

    spark.sql("DESCRIBE TABLE " + valueSetTable);

    hasExistingValueSets = true;

  } catch (Exception describeException) {

    // Checked exceptions when calling into Scala upset the Java compiler,
    // hence the need for this workaround and re-throw to propagate unexpected
    // failures.
    if (describeException instanceof NoSuchTableException) {

      hasExistingValueSets = false;

    } else {

      throw new RuntimeException(describeException);
    }
  }

  // If the target tables do not exist, we create them. The values and ancestors tables are
  // created explicitly to meet our partitioning system
  if (!hasExistingValueSets) {

    createValuesTable(spark, valuesTable, null);

    JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

    // Create a value set table by writing empty data having the proper schema and properties
    spark.createDataFrame(sparkContext.emptyRDD(), valueSetRowConverter.getSchema())
        .withColumn("timestamp", lit(null).cast("timestamp"))
        .write()
        .format("parquet")
        .partitionBy("timestamp")
        .saveAsTable(valueSetTable);

  }

  // Check existing value set URIs and Versions for duplicates among the new members
  Dataset<UrlAndVersion> currentMembers = this.spark.table(valueSetTable)
      .select("url", "version")
      .distinct()
      .as(URL_AND_VERSION_ENCODER);

  if (hasDuplicateUrlAndVersions(currentMembers)) {

    throw new IllegalArgumentException("The given value sets contains duplicate url and versions "
        + "against value sets already stored in the table, " + valueSetTable);
  }

  writeValuesToTable(this.values, valuesTable);

  this.valueSets.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(valueSetTable);
}