org.apache.spark.sql.SaveMode Java Examples

The following examples show how to use org.apache.spark.sql.SaveMode. They are taken from open-source projects; the source file, project, and license are noted above each example.
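SaveMode is the enum a DataFrameWriter uses to decide what happens when data already exists at the write target: ErrorIfExists (the default), Append, Overwrite, and Ignore. As a quick orientation before the project examples, here is a minimal sketch of the typical pattern; the session settings and file paths are placeholders, not taken from any of the projects below.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class SaveModeSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local")
        .appName("SaveMode sketch")
        .getOrCreate();

    // Placeholder input path.
    Dataset<Row> df = spark.read().json("/tmp/people.json");

    // Overwrite replaces whatever is already at the target path;
    // Append adds to it, Ignore silently skips the write if data exists,
    // and ErrorIfExists (the default) throws an exception.
    df.write()
        .mode(SaveMode.Overwrite)
        .parquet("/tmp/people.parquet");

    spark.stop();
  }
}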
Example #1
Source File: HoodieSnapshotExporter.java    From hudi with Apache License 2.0
private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
  Partitioner defaultPartitioner = dataset -> {
    Dataset<Row> hoodieDroppedDataset = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
    return StringUtils.isNullOrEmpty(cfg.outputPartitionField)
        ? hoodieDroppedDataset.write()
        : hoodieDroppedDataset.repartition(new Column(cfg.outputPartitionField)).write().partitionBy(cfg.outputPartitionField);
  };

  Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner)
      ? defaultPartitioner
      : ReflectionUtils.loadClass(cfg.outputPartitioner);

  final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
  Iterator<String> exportingFilePaths = jsc
      .parallelize(partitions, partitions.size())
      .flatMap(partition -> fsView
          .getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
          .map(HoodieBaseFile::getPath).iterator())
      .toLocalIterator();

  Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
  partitioner.partition(sourceDataset)
      .format(cfg.outputFormat)
      .mode(SaveMode.Overwrite)
      .save(cfg.targetOutputPath);
}
 
Example #2
Source File: DataSources.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    Dataset<Row> rddDataset = sparkSession.read().parquet("usr/local/data.parquet");
    rddDataset.select("name","age").write().save("nameAndAge.parquet");

    Dataset<Row> jsonDataSet = sparkSession.read().json("usr/local/data.json");
    jsonDataSet.select("name","age").write().save("nameAndAge.json");

    // Manually specify the data source format

    Dataset<Row> customDataSource = sparkSession.read().format("json").load("usr/local/data.json");
    customDataSource.select("name","age").write().format("json").mode(SaveMode.Append).save("nameAndAge.json");
}
 
Example #3
Source File: TestSuite.java    From stocator with Apache License 2.0
public void test16(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
    throws Exception {
  System.out.println("*********************************");
  System.out.println("T16: Non overwrite mode " + containerOut);
  String o1 = containerOut + "myData/123";
  StructType schema = DataTypes
      .createStructType(new StructField[] { DataTypes.createStructField("NAME", DataTypes.StringType, false),
          DataTypes.createStructField("STRING_VALUE", DataTypes.StringType, false),
          DataTypes.createStructField("NUM_VALUE", DataTypes.IntegerType, false), });
  Row r1 = RowFactory.create("name1", "value1", 1);
  Row r2 = RowFactory.create("name2", "value2", 2);
  List<Row> rowList = ImmutableList.of(r1, r2);
  Dataset<Row> rows = spark.createDataFrame(rowList, schema);
  try {
    if (type.equals(Constants.PARQUET_TYPE)) {
      rows.write().mode(SaveMode.Overwrite).parquet(o1);
    } else if (type.equals(Constants.JSON_TYPE)) {
      rows.write().mode(SaveMode.Overwrite).json(o1);
    }
  } catch (Exception e) {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
    throw e;
  } finally {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
  }
}
 
Example #4
Source File: WriteToDiscStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
	
    // remove spaces from column names as parquet does not support them
    for(String columnName : dataset.columns()) {
        if(columnName.contains(" ")) {
            String newColumnName = columnName.replace(' ', '_');
            dataset = dataset.withColumnRenamed(columnName, newColumnName);
        }
    }

    dataset.cache();
    BpmnaiUtils.getInstance().writeDatasetToParquet(dataset, "result", config);

    if(config.isGenerateResultPreview()) {
        dataset.limit(config.getResultPreviewLineCount()).write().mode(SaveMode.Overwrite).saveAsTable(BpmnaiVariables.RESULT_PREVIEW_TEMP_TABLE);
    }

    return dataset;
}
 
Example #5
Source File: TestIcebergSourceTablesBase.java    From iceberg with Apache License 2.0
@Test
public synchronized void testTablesSupport() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "table");
  createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "1"),
      new SimpleRecord(2, "2"),
      new SimpleRecord(3, "3"));

  Dataset<Row> inputDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode(SaveMode.Append)
      .save(loadLocation(tableIdentifier));

  Dataset<Row> resultDf = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier));
  List<SimpleRecord> actualRecords = resultDf.orderBy("id")
      .as(Encoders.bean(SimpleRecord.class))
      .collectAsList();

  Assert.assertEquals("Records should match", expectedRecords, actualRecords);
}
 
Example #6
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
                                               DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}
 
Example #7
Source File: WriteToDataSinkStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

    /*
    TODO: Not working yet
	// if output format is set to "csv" create both: csv and parquet 
	if(SparkImporterKafkaImportArguments.getInstance().getOutputFormat().equals(SparkImporterVariables.OUTPUT_FORMAT_CSV)) {
		dataset
        .write()
        .option("header", "true")
        .option("delimiter", ";")
        .option("ignoreLeadingWhiteSpace", "false")
        .option("ignoreTrailingWhiteSpace", "false")
        .mode(SparkImporterVariables.getSaveMode())
        .csv(SparkImporterVariables.getTargetFolder());
	}
	*/
  
	dataset
            // repartition by process instance id so Spark can distribute the data more evenly across workers, since the subsequent operations are per process instance
            .repartition(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID))
            .write()
            .mode(SaveMode.Append)
            .save(config.getTargetFolder());

    return dataset;
}
 
Example #8
Source File: HiveOutput.java    From envelope with Apache License 2.0
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {    
  for (Tuple2<MutationType, Dataset<Row>> plan : planned) {
    MutationType mutationType = plan._1();
    Dataset<Row> mutation = (doesAlignColumns) ? alignColumns(plan._2()) : plan._2();
    DataFrameWriter<Row> writer = mutation.write();

    if (partitionColumns != null) {
      writer = writer.partitionBy(partitionColumns);
    }

    if (options != null) {
      writer = writer.options(options);
    }

    switch (mutationType) {
      case INSERT:
        writer = writer.mode(SaveMode.Append);
        break;
      case OVERWRITE:
        writer = writer.mode(SaveMode.Overwrite);
        break;
      default:
        throw new RuntimeException("Hive output does not support mutation type: " + mutationType);
    }

    writer.insertInto(tableName);
  }
}
 
Example #9
Source File: IcebergSource.java    From iceberg with Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dfStruct, SaveMode mode,
                                                 DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append, "Save mode %s is not supported", mode);

  Table table = findTable(options);

  Schema dfSchema = SparkSchemaUtil.convert(table.schema(), dfStruct);
  List<String> errors = CheckCompatibility.writeCompatibilityErrors(table.schema(), dfSchema);
  if (!errors.isEmpty()) {
    StringBuilder sb = new StringBuilder();
    sb.append("Cannot write incompatible dataframe to table with schema:\n")
        .append(table.schema()).append("\nProblems:");
    for (String error : errors) {
      sb.append("\n* ").append(error);
    }
    throw new IllegalArgumentException(sb.toString());
  }

  Optional<String> formatOption = options.get("iceberg.write.format");
  FileFormat format;
  if (formatOption.isPresent()) {
    format = FileFormat.valueOf(formatOption.get().toUpperCase(Locale.ENGLISH));
  } else {
    format = FileFormat.valueOf(table.properties()
        .getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT)
        .toUpperCase(Locale.ENGLISH));
  }

  return Optional.of(new Writer(table, lazyConf(), format));
}
 
Example #10
Source File: SaveModelDemo.java    From SparkDemo with MIT License
public static void main(String[] args) {
	SparkConf conf = new SparkConf().setAppName("SaveModelDemo").setMaster("local");
	JavaSparkContext sc = new JavaSparkContext(conf);
	// Create a DataFrame by reading JSON
	SQLContext sqlContext = new SQLContext(sc);

	Dataset<Row> dataset = sqlContext.read().format("json").load(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

	dataset.write().mode(SaveMode.ErrorIfExists).save("tmp/people2.json"); // fail if the target already exists
	dataset.write().mode(SaveMode.Append).save("tmp/people2.json"); // append to existing data
	dataset.write().mode(SaveMode.Ignore).save("tmp/people2.json"); // silently skip if the target already exists
	dataset.write().mode(SaveMode.Overwrite).save("tmp/people2.json");// overwrite existing data

	sc.close();
}
 
Example #11
Source File: AbstractValueSets.java    From bunsen with Apache License 2.0
/**
 * Writes value records to a table. This method ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param values a dataset of value records
 * @param tableName the table to write them to
 */
private static void writeValuesToTable(Dataset<Value> values, String tableName) {

  // Note the last two columns here must be the partitioned-by columns in order and in lower case
  // for Spark to properly match them to the partitions
  Dataset<Row> orderColumnDataset = values.select("system",
      "version",
      "value",
      "valueseturi",
      "valuesetversion");

  orderColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
 
Example #12
Source File: HiveWarehouseConnector.java    From spark-llap with Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType schema,
    SaveMode mode, DataSourceOptions options) {
  Map<String, String> params = getOptions(options);
  String stagingDirPrefix = HWConf.LOAD_STAGING_DIR.getFromOptionsMap(params);
  Path path = new Path(stagingDirPrefix);
  Configuration conf = SparkSession.getActiveSession().get().sparkContext().hadoopConfiguration();
  return Optional.of(getDataSourceWriter(jobId, schema, path, params, conf));
}
 
Example #13
Source File: JavaIgniteDataFrameWriteExample.java    From ignite with Apache License 2.0
/** */
private static void editDataAndSaveToNewTable(Ignite ignite, SparkSession spark) {
    //Load content of Ignite table to data frame.
    Dataset<Row> personDataFrame = spark.read()
            .format(IgniteDataFrameSettings.FORMAT_IGNITE())
            .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)
            .option(IgniteDataFrameSettings.OPTION_TABLE(), "person")
            .load();

    System.out.println("Data frame content:");

    //Printing content of data frame to console.
    personDataFrame.show();

    System.out.println("Modifying Data Frame and write it to Ignite:");

    personDataFrame
            .withColumn("id", col("id").plus(42)) //Edit id column
            .withColumn("name", reverse(col("name"))) //Edit name column
            .write().format(IgniteDataFrameSettings.FORMAT_IGNITE())
            .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)
            .option(IgniteDataFrameSettings.OPTION_TABLE(), "new_persons")
            .option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PRIMARY_KEY_FIELDS(), "id, city_id")
            .option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PARAMETERS(), "backups=1")
            .mode(SaveMode.Overwrite) //Overwriting entire table.
            .save();

    System.out.println("Done!");

    System.out.println("Reading data from Ignite table:");

    CacheConfiguration<?, ?> ccfg = new CacheConfiguration<>(CACHE_NAME);

    IgniteCache<?, ?> cache = ignite.getOrCreateCache(ccfg);

    //Reading saved data from Ignite.
    List<List<?>> data = cache.query(new SqlFieldsQuery("SELECT id, name, city_id FROM new_persons")).getAll();

    System.out.println(data);
}
 
Example #14
Source File: AbstractArguments.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
public void createOrUpdateSparkRunnerConfig(SparkRunnerConfig config) {
    if(config == null) {
        config = new SparkRunnerConfig();
    }

    config.setTargetFolder(this.fileDestination);
    config.setWorkingDirectory(this.workingDirectory);
    config.setLogDirectory(this.logDirectory);
    config.setOutputFormat(this.outputFormat);
    config.setSaveMode(this.saveMode == BpmnaiVariables.SAVE_MODE_APPEND ? SaveMode.Append : SaveMode.Overwrite);
    config.setDataLevel(this.dataLevel);
    config.setWriteStepResultsIntoFile(this.writeStepResultsToCSV);
}
 
Example #15
Source File: IcebergSourceBenchmark.java    From iceberg with Apache License 2.0
protected void appendAsFile(Dataset<Row> ds) {
  // ensure the schema is precise (including nullability)
  StructType sparkSchema = SparkSchemaUtil.convert(table.schema());
  spark.createDataFrame(ds.rdd(), sparkSchema)
      .coalesce(1)
      .write()
      .format("iceberg")
      .mode(SaveMode.Append)
      .save(table.location());
}
 
Example #16
Source File: MetroAnalysisJob.java    From hui-bigdata-spark with Apache License 2.0
    /**
     * Core data-processing logic.
     *
     * @param sparkContext the Java Spark context
     * @param inPutPath the input path
     * @param outPutPath the output path
     */
    private void deal(JavaSparkContext sparkContext, String inPutPath, String outPutPath) {
        SparkJobUtil.checkFileExists(inPutPath);

        SQLContext sqlContext = new SQLContext(sparkContext);
//        sqlContext.setConf("spark.sql.parquet.binaryAsString","true");

        // Register the snapshot as a temporary table
        Dataset<Row> dataset = sqlContext.read().json(inPutPath);
        dataset.registerTempTable("hui_metro_testjson");
        dataset.show(10);

        Dataset<Row> resultFrame = sqlContext.sql(SQL);

        if (resultFrame.count() > 0) {
            resultFrame.repartition(3).write()
                    .mode(SaveMode.Append).json(outPutPath);
        }

        resultFrame.show(10);

        // Write the results to the database
        MySQLJdbcConfig jdbcConfig = new MySQLJdbcConfig();
        jdbcConfig.init();
        resultFrame.write().mode("append")
                .jdbc(jdbcConfig.getUrl(), "hui_metro_test", jdbcConfig.getConnectionProperties());
    }
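Note that Example #16 mixes the enum form (SaveMode.Append) with the string form ("append") on the JDBC write; DataFrameWriter.mode() accepts either, and the strings "append", "overwrite", "ignore", and "error"/"errorifexists" map onto the corresponding SaveMode values. A minimal sketch with a hypothetical helper and placeholder path:

// Hypothetical helper: both calls below request the same append semantics.
private static void appendBothWays(Dataset<Row> df, String path) {
    df.write().mode(SaveMode.Append).json(path);
    df.write().mode("append").json(path);
}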
 
Example #17
Source File: Hierarchies.java    From bunsen with Apache License 2.0
/**
 * Writes ancestor records to a table. This method ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described <a
 * href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param ancestors a dataset of ancestor records
 * @param tableName the table to write them to
 */
private static void writeAncestorsToTable(Dataset<Ancestor> ancestors, String tableName) {

  Dataset<Row> orderedColumnDataset = ancestors.select("descendantSystem",
      "descendantValue",
      "ancestorSystem",
      "ancestorValue",
      "uri",
      "version");

  orderedColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
 
Example #18
Source File: TestDataFrameWrites.java    From iceberg with Apache License 2.0
@Test
public void testNullableWithWriteOption() throws IOException {
  Assume.assumeTrue("Spark 3.0 rejects writing nulls to a required column", spark.version().startsWith("2"));

  File location = new File(temp.newFolder("parquet"), "test");
  String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString());
  String targetPath = String.format("%s/nullable_poc/targetFolder/", location.toString());

  tableProperties = ImmutableMap.of(TableProperties.WRITE_NEW_DATA_LOCATION, targetPath);

  // read this and append to iceberg dataset
  spark
      .read().schema(sparkSchema).json(
      JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data1))
      .write().parquet(sourcePath);

  // this is our iceberg dataset to which we will append data
  new HadoopTables(spark.sessionState().newHadoopConf())
      .create(
          icebergSchema,
          PartitionSpec.builderFor(icebergSchema).identity("requiredField").build(),
          tableProperties,
          targetPath);

  // this is the initial data inside the iceberg dataset
  spark
      .read().schema(sparkSchema).json(
      JavaSparkContext.fromSparkContext(spark.sparkContext()).parallelize(data0))
      .write().format("iceberg").mode(SaveMode.Append).save(targetPath);

  // read from parquet and append to iceberg w/ nullability check disabled
  spark
      .read().schema(SparkSchemaUtil.convert(icebergSchema)).parquet(sourcePath)
      .write().format("iceberg").option("check-nullability", false).mode(SaveMode.Append).save(targetPath);

  // read all data
  List<Row> rows = spark.read().format("iceberg").load(targetPath).collectAsList();
  Assert.assertEquals("Should contain 6 rows", 6, rows.size());
}
 
Example #19
Source File: IcebergSourceNestedParquetDataWriteBenchmark.java    From iceberg with Apache License 2.0
@Benchmark
@Threads(1)
public void writeFileSource() {
  Map<String, String> conf = Maps.newHashMap();
  conf.put(SQLConf.PARQUET_COMPRESSION().key(), "gzip");
  withSQLConf(conf, () -> benchmarkData().write().mode(SaveMode.Append).parquet(dataLocation()));
}
 
Example #20
Source File: IcebergSourceFlatParquetDataWriteBenchmark.java    From iceberg with Apache License 2.0
@Benchmark
@Threads(1)
public void writeFileSource() {
  Map<String, String> conf = Maps.newHashMap();
  conf.put(SQLConf.PARQUET_COMPRESSION().key(), "gzip");
  withSQLConf(conf, () -> benchmarkData().write().mode(SaveMode.Append).parquet(dataLocation()));
}
 
Example #21
Source File: SparkDataSetTest.java    From spliceengine with GNU Affero General Public License v3.0
@Test
    public void testFoobar() {
        List<Row> foo = new ArrayList();
        for (int i = 0; i< 10; i++) {
            ValueRow row = new ValueRow(1);
            row.setColumn(1,new SQLInteger(i));
            foo.add(row);
        }

        StructType schema = DataTypes.createStructType(new StructField[]{DataTypes.createStructField("col1", DataTypes.IntegerType, true)});

//        ValueRow row = new ValueRow(2);
//        row.setColumn(1,new SQLDouble());
//        row.setColumn(2,new SQLInteger());

/*

        SpliceSpark.getSession().read().parquet("/Users/jleach/Documents/workspace/spliceengine/hbase_sql/target/external/simple_parquet")
                .select(new Column("0"),new Column("1"))
                .filter(col("0").gt(1).or(col("0").lt(4))).explain(true);
*/
        SpliceSpark.getSessionUnsafe().createDataFrame(foo,schema).write().format("orc").mode(SaveMode.Append)
                .orc("/Users/jleach/Documents/workspace/spliceengine/hbase_sql/target/external/orc_it");

        Column filter = (new Column("col1")).gt(1L).and(new Column("col1").lt(1L));

        SpliceSpark.getSessionUnsafe().read().schema(schema)
                .orc("/Users/jleach/Documents/workspace/spliceengine/hbase_sql/target/external/orc_it")
                .filter(filter).show();
//                .select(new Column("0"),new Column("1")).show();

/*
        Dataset<Row> leftSide = SpliceSpark.getSession().createDataFrame(foo,foo.get(0).schema());
        Dataset<Row> rightSide = SpliceSpark.getSession().createDataFrame(foo.subList(0,8),foo.get(0).schema());

        Column col =
                (leftSide.col("0").equalTo(rightSide.col("0"))).
                and((leftSide.col("1")).equalTo(rightSide.col("1")));
        leftSide.join(rightSide,col,"inner").explain(true);
        leftSide.join(rightSide,col,"inner").show(10);
        leftSide.join(broadcast(rightSide),col,"leftouter").explain(true);
        leftSide.join(broadcast(rightSide),col,"leftouter").show(10);
        leftSide.join(broadcast(rightSide),col,"leftanti").show(10);
        */
    }
 
Example #22
Source File: HoodieJavaStreamingApp.java    From hudi with Apache License 2.0
/**
 * Adds data to the streaming source and shows the query results over time.
 *
 * @param spark the Spark session
 * @param fs the file system used to look up Hudi commit instants
 * @param inputDF1 the first batch of input records
 * @param inputDF2 the second batch of input records
 * @throws Exception if writing to or querying the table fails
 */
public void show(SparkSession spark, FileSystem fs, Dataset<Row> inputDF1, Dataset<Row> inputDF2) throws Exception {
  inputDF1.write().mode(SaveMode.Append).json(streamingSourcePath);
  // wait for spark streaming to process one microbatch
  Thread.sleep(3000);
  String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
  LOG.info("First commit at instant time :" + commitInstantTime1);

  inputDF2.write().mode(SaveMode.Append).json(streamingSourcePath);
  // wait for spark streaming to process one microbatch
  Thread.sleep(3000);
  String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
  LOG.info("Second commit at instant time :" + commitInstantTime2);

  /**
   * Read & do some queries
   */
  Dataset<Row> hoodieROViewDF = spark.read().format("org.apache.hudi")
      // pass any path glob, can include hoodie & non-hoodie
      // datasets
      .load(tablePath + "/*/*/*/*");
  hoodieROViewDF.registerTempTable("hoodie_ro");
  spark.sql("describe hoodie_ro").show();
  // all trips whose fare amount was greater than 2.
  spark.sql("select fare.amount, begin_lon, begin_lat, timestamp from hoodie_ro where fare.amount > 2.0").show();

  if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
    /**
     * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
     */
    Dataset<Row> hoodieIncViewDF = spark.read().format("org.apache.hudi")
        .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
        // Only changes in write 2 above
        .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1)
        // For incremental view, pass in the root/base path of dataset
        .load(tablePath);

    LOG.info("You will only see records from : " + commitInstantTime2);
    hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
  }
}
 
Example #23
Source File: FileSystemOutput.java    From envelope with Apache License 2.0
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {
  for (Tuple2<MutationType, Dataset<Row>> plan : planned) {
    MutationType mutationType = plan._1();
    Dataset<Row> mutation = plan._2();

    DataFrameWriter<Row> writer = mutation.write();

    if (columns != null) {
      LOG.debug("Partitioning output");

      writer = writer.partitionBy(columns.toArray(new String[columns.size()]));
    }

    switch (mutationType) {
      case INSERT:
        writer = writer.mode(SaveMode.Append);
        break;
      case OVERWRITE:
        writer = writer.mode(SaveMode.Overwrite);
        break;
      default:
        throw new RuntimeException("Filesystem output does not support mutation type: " + mutationType);
    }

    switch (format) {
      case PARQUET_FORMAT:
        LOG.debug("Writing Parquet: {}", path);
        writer.parquet(path);
        break;
      case CSV_FORMAT:
        LOG.debug("Writing CSV: {}", path);
        writer.options(options).csv(path);
        break;
      case JSON_FORMAT:
        LOG.debug("Writing JSON: {}", path);
        writer.json(path);
        break;
      default:
        throw new RuntimeException("Filesystem output does not support file format: " + format);
    }
  }
}
 
Example #24
Source File: HiveStreamingDataSource.java    From spark-llap with Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(final String jobId, final StructType schema, final SaveMode mode,
  final DataSourceOptions options) {
  return Optional.of(createDataSourceWriter(jobId, schema, options));
}
 
Example #25
Source File: ProcessVendorTrasactions.java    From aws-big-data-blog with Apache License 2.0
public static void run(String jobInputParam) throws Exception{
	
   	List<StructField> schemaFields = new ArrayList<StructField>();
   	schemaFields.add(DataTypes.createStructField("vendor_id", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("trans_amount", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("trans_type", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("item_id", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("trans_date", DataTypes.StringType, true));
   	StructType schema = DataTypes.createStructType(schemaFields);

   	SparkConf conf = new SparkConf().setAppName("Spark Redshift No Access-Keys");
   	SparkSession spark = SparkSession.builder().config(conf).getOrCreate();	
	JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
			
	String redshiftJDBCURL=props.getProperty("redshift.jdbc.url");
	String s3TempPath = props.getProperty("s3.temp.path");
	System.out.println("props"+props);
	
	JavaRDD<Row> salesRDD = sc.textFile(jobInputParam)
			.map(new Function<String, Row>() {
				public Row call(String saleRec) {
					String[] fields = saleRec.split(",");
					return RowFactory.create(fields[0], fields[1], fields[2], fields[3], fields[4]);
				}
			});
	Dataset<Row> salesDF = spark.createDataFrame(salesRDD,schema);
	Dataset<Row> vendorItemSaleAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("4")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum"));
	Dataset<Row> vendorItemTaxAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("5")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum"));
	Dataset<Row> vendorItemDiscountAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("6")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum"));
	String[] joinColArray = {"vendor_id","item_id","trans_date"};
	vendorItemSaleAmountDF.printSchema();
	Seq<String> commonJoinColumns = scala.collection.JavaConversions.asScalaBuffer(Arrays.asList(joinColArray)).seq();

	Dataset<Row> vendorAggregatedDF = vendorItemSaleAmountDF.join(vendorItemTaxAmountDF,commonJoinColumns,"left_outer")
							 .join(vendorItemDiscountAmountDF,commonJoinColumns,"left_outer")
							 .toDF("vendor_id","item_id","trans_date","sale_amount","tax_amount","discount_amount");
	
	vendorAggregatedDF.printSchema();
	DefaultAWSCredentialsProviderChain provider = new DefaultAWSCredentialsProviderChain();
	AWSSessionCredentials creds  = (AWSSessionCredentials) provider.getCredentials();
	
	String appendix=new StringBuilder(String.valueOf(System.currentTimeMillis())).append("_").append(String.valueOf(new Random().nextInt(10)+1)).toString();
	String vendorTransSummarySQL = new StringBuilder("begin transaction;delete from vendortranssummary using vendortranssummary_temp")
			 .append(appendix)
			 .append(" where vendortranssummary.vendor_id=vendortranssummary_temp")
			 .append(appendix)
			 .append(".vendor_id and vendortranssummary.item_id=vendortranssummary_temp")
			 .append(appendix)
			 .append(".item_id and vendortranssummary.trans_date = vendortranssummary_temp")
			 .append(appendix)
			 .append(".trans_date;")
			 .append("insert into vendortranssummary select * from vendortranssummary_temp")
			 .append(appendix)
			 .append(";drop table vendortranssummary_temp")
			 .append(appendix)
			 .append(";end transaction;").toString();
	vendorAggregatedDF.write().format("com.databricks.spark.redshift").option("url", redshiftJDBCURL)
    .option("dbtable", "vendortranssummary_temp"+appendix)
    .option("usestagingtable","false")
    .option("postactions",vendorTransSummarySQL)
    .option("temporary_aws_access_key_id", creds.getAWSAccessKeyId())
    .option("temporary_aws_secret_access_key",creds.getAWSSecretKey())
    .option("temporary_aws_session_token", creds.getSessionToken())
    .option("tempdir", s3TempPath).mode(SaveMode.Overwrite).save();
		
}
 
Example #26
Source File: SparkDataSet.java    From spliceengine with GNU Affero General Public License v3.0
@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<ExecRow> writeAvroFile(DataSetProcessor dsp,
                                      int[] partitionBy,
                                      String location,
                                      String compression,
                                      OperationContext context) throws StandardException
{
    compression = SparkDataSet.getAvroCompression(compression);

    StructType dataSchema = null;
    StructType tableSchema = generateTableSchema(context);

    // what is this? why is this so different from parquet/orc ?
    // actually very close to NativeSparkDataSet.writeFile
    dataSchema = ExternalTableUtils.getDataSchema(dsp, tableSchema, partitionBy, location, "a");

    if (dataSchema == null)
        dataSchema = tableSchema;

    Dataset<Row> insertDF = SpliceSpark.getSession().createDataFrame(
            rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context))).map(new LocatedRowToRowAvroFunction()),
            dataSchema);


    // We duplicate the code in NativeSparkDataset.writeAvroFile here to avoid calling  ExternalTableUtils.getDataSchema() twice
    List<String> partitionByCols = new ArrayList();
    for (int i = 0; i < partitionBy.length; i++) {
        partitionByCols.add(dataSchema.fields()[partitionBy[i]].name());
    }
    if (partitionBy.length > 0) {
        List<Column> repartitionCols = new ArrayList();
        for (int i = 0; i < partitionBy.length; i++) {
            repartitionCols.add(new Column(dataSchema.fields()[partitionBy[i]].name()));
        }
        insertDF = insertDF.repartition(scala.collection.JavaConversions.asScalaBuffer(repartitionCols).toList());
    }
    if (compression.equals("none")) {
        compression = "uncompressed";
    }
    insertDF.write().option(SPARK_COMPRESSION_OPTION,compression).partitionBy(partitionByCols.toArray(new String[partitionByCols.size()]))
            .mode(SaveMode.Append).format("com.databricks.spark.avro").save(location);
    ValueRow valueRow=new ValueRow(1);
    valueRow.setColumn(1,new SQLLongint(context.getRecordsWritten()));
    return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1));
}
 
Example #27
Source File: SparkSqlCommand.java    From geowave with Apache License 2.0
@Override
public Void computeResults(final OperationParams params) throws Exception {

  // Config file
  final File configFile = getGeoWaveConfigFile(params);

  final String sql = parameters.get(0);

  LOGGER.debug("Input SQL: " + sql);
  final String cleanSql =
      initStores(configFile, sql, sparkSqlOptions.getOutputStoreName(), params.getConsole());

  LOGGER.debug("Running with cleaned SQL: " + cleanSql);
  sqlRunner.setSql(cleanSql);
  sqlRunner.setAppName(sparkSqlOptions.getAppName());
  sqlRunner.setHost(sparkSqlOptions.getHost());
  sqlRunner.setMaster(sparkSqlOptions.getMaster());

  stopwatch.reset();
  stopwatch.start();

  // Execute the query
  final Dataset<Row> results = sqlRunner.run();

  stopwatch.stop();

  if (LOGGER.isDebugEnabled()) {
    LOGGER.debug("Spark SQL query took " + stopwatch.getTimeString());
    LOGGER.debug("   and got " + results.count() + " results");
    results.printSchema();
  }

  if (sparkSqlOptions.getShowResults() > 0) {
    results.show(sparkSqlOptions.getShowResults(), false);
  }

  params.getConsole().println("GeoWave SparkSQL query returned " + results.count() + " results");

  if (outputDataStore != null) {
    final SqlResultsWriter sqlResultsWriter = new SqlResultsWriter(results, outputDataStore);

    String typeName = sparkSqlOptions.getOutputTypeName();
    if (typeName == null) {
      typeName = "sqlresults";
    }

    params.getConsole().println("Writing GeoWave SparkSQL query results to datastore...");
    sqlResultsWriter.writeResults(typeName);
    params.getConsole().println("Datastore write complete.");
  }

  if (sparkSqlOptions.getCsvOutputFile() != null) {
    results.repartition(1).write().format("com.databricks.spark.csv").option(
        "header",
        "true").mode(SaveMode.Overwrite).save(sparkSqlOptions.getCsvOutputFile());
  }
  sqlRunner.close();
  return null;
}
 
Example #28
Source File: AppMain.java    From SparkToParquet with Apache License 2.0
public static void main(String[] args) throws IOException {
	Flags.setFromCommandLineArgs(THE_OPTIONS, args);

	// Initialize the Spark configuration
	SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
	JavaSparkContext sc = new JavaSparkContext(conf);
	JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
	SQLContext sqlContext = new SQLContext(sc);

	// Initialize Kafka topics and parameters
	HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
	HashMap<String, String> kafkaParams = new HashMap<String, String>();
	kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

	// Pull data from the Kafka stream
	JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
			StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

	JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
		private static final long serialVersionUID = 5266880065425088203L;

		public String call(Tuple2<String, String> tuple2) {
			return tuple2._2();
		}
	});

	JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
		List<ApacheAccessLog> list = new ArrayList<>();
		try {
			// Parse each log line
			list.add(ApacheAccessLog.parseFromLogLine(line));
			return list;
		} catch (RuntimeException e) {
			return list;
		}
	}).cache();

	accessLogsDStream.foreachRDD(rdd -> {

		// rdd to DataFrame
		DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
		// Write the batch out as Parquet files
		df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());

		return null;
	});

	// Start the streaming context
	jssc.start(); // start the computation
	jssc.awaitTermination(); // wait for termination
}
 
Example #29
Source File: DbPersistorSQLServer.java    From rdf2x with Apache License 2.0
public DbPersistorSQLServer(DbConfig config, SaveMode saveMode) {
    super(config, saveMode);
}
 
Example #30
Source File: DbPersistorPostgres.java    From rdf2x with Apache License 2.0
public DbPersistorPostgres(DbConfig config, SaveMode saveMode) {
    super(config, saveMode);
}