Java Code Examples for org.apache.spark.sql.Encoders

The following examples show how to use org.apache.spark.sql.Encoders. They are extracted from open source projects; the source project, source file, and license are noted above each example.
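Before the project examples, here is a minimal orientation sketch (not taken from any of the projects below) showing the Encoders factory methods that recur throughout: Encoders.STRING(), Encoders.INT(), Encoders.bean(), and Encoders.kryo(). The SparkSession settings and the Person bean are illustrative assumptions.

import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class EncodersQuickStart {

  // Simple JavaBean used with Encoders.bean(); illustrative only
  public static class Person implements Serializable {
    private String name;
    private int age;

    public Person() {}
    public Person(String name, int age) { this.name = name; this.age = age; }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }
  }

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("EncodersQuickStart")
        .getOrCreate();

    // primitive encoders for common types
    Dataset<String> strings = spark.createDataset(Arrays.asList("a", "b"), Encoders.STRING());
    Dataset<Integer> ints = spark.createDataset(Arrays.asList(1, 2, 3), Encoders.INT());

    // bean encoder: columns are derived from the bean's getters and setters
    Dataset<Person> people = spark.createDataset(
        Arrays.asList(new Person("spark", 10)), Encoders.bean(Person.class));

    // kryo encoder: serializes each object into a single binary column,
    // useful when a type cannot be mapped to SQL columns
    Dataset<Person> kryoPeople = spark.createDataset(
        Arrays.asList(new Person("kryo", 20)), Encoders.kryo(Person.class));

    strings.show();
    ints.show();
    people.show();
    kryoPeople.show();

    spark.stop();
  }
}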
Example 1
Source Project: metron   Source File: TextEncodedTelemetryReaderTest.java    License: Apache License 2.0
@Test
public void testCSV() {
  // re-write the test data as a CSV with a header record
  String pathToCSV = tempFolder.getRoot().getAbsolutePath();
  spark.read()
          .format("text")
          .load("src/test/resources/telemetry.json")
          .as(Encoders.STRING())
          .write()
          .mode("overwrite")
          .option("header", "true")
          .format("csv")
          .save(pathToCSV);

  // tell the profiler to use the CSV input data
  profilerProperties.put(TELEMETRY_INPUT_PATH.getKey(), pathToCSV);
  profilerProperties.put(TELEMETRY_INPUT_FORMAT.getKey(), "csv");

  // set a reader property; tell the reader to expect a header
  readerProperties.put("header", "true");

  // there should be 100 valid JSON records
  Dataset<String> telemetry = TelemetryReaders.TEXT.read(spark, profilerProperties, readerProperties);
  assertEquals(100, telemetry.filter(new IsValidJSON()).count());
}
 
Example 2
Source Project: envelope   Source File: TestSQLDeriver.java    License: Apache License 2.0
@Test
public void testQueryFile() throws Exception {
  Contexts.getSparkSession().createDataset(Lists.newArrayList(1), Encoders.INT()).createOrReplaceTempView("literaltable");

  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(SQLDeriver.QUERY_FILE_CONFIG_NAME, getClass().getResource("/sql/query_without_parameters.sql").getPath());
  Config config = ConfigFactory.parseMap(configMap);

  SQLDeriver deriver = new SQLDeriver();
  assertNoValidationFailures(deriver, config);
  deriver.configure(config);

  Object result = deriver.derive(Maps.<String, Dataset<Row>>newHashMap()).collectAsList().get(0).get(0);

  assertEquals(1, result);
}
 
Example 3
Source Project: iceberg   Source File: TestIcebergSourceTablesBase.java    License: Apache License 2.0
@Test
public synchronized void testTablesSupport() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "table");
  createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "1"),
      new SimpleRecord(2, "2"),
      new SimpleRecord(3, "3"));

  Dataset<Row> inputDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode(SaveMode.Append)
      .save(loadLocation(tableIdentifier));

  Dataset<Row> resultDf = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier));
  List<SimpleRecord> actualRecords = resultDf.orderBy("id")
      .as(Encoders.bean(SimpleRecord.class))
      .collectAsList();

  Assert.assertEquals("Records should match", expectedRecords, actualRecords);
}
 
Example 4
Source Project: iceberg   Source File: RewriteManifestsAction.java    License: Apache License 2.0
RewriteManifestsAction(SparkSession spark, Table table) {
  this.spark = spark;
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class);
  this.table = table;
  this.spec = table.spec();
  this.targetManifestSizeBytes = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.MANIFEST_TARGET_SIZE_BYTES,
      TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT);
  this.fileIO = SparkUtil.serializableFileIO(table);

  // default the staging location to the metadata location
  TableOperations ops = ((HasTableOperations) table).operations();
  Path metadataFilePath = new Path(ops.metadataFileLocation("file"));
  this.stagingLocation = metadataFilePath.getParent().toString();

  // use the current table format version for new manifests
  this.formatVersion = ops.current().formatVersion();
}
 
Example 5
Source Project: iceberg   Source File: RemoveOrphanFilesAction.java    License: Apache License 2.0
@Override
public List<String> execute() {
  Dataset<Row> validDataFileDF = buildValidDataFileDF();
  Dataset<Row> validMetadataFileDF = buildValidMetadataFileDF();
  Dataset<Row> validFileDF = validDataFileDF.union(validMetadataFileDF);
  Dataset<Row> actualFileDF = buildActualFileDF();

  Column nameEqual = filename.apply(actualFileDF.col("file_path"))
      .equalTo(filename.apply(validFileDF.col("file_path")));
  Column actualContains = actualFileDF.col("file_path").contains(validFileDF.col("file_path"));
  Column joinCond = nameEqual.and(actualContains);
  List<String> orphanFiles = actualFileDF.join(validFileDF, joinCond, "leftanti")
      .as(Encoders.STRING())
      .collectAsList();

  Tasks.foreach(orphanFiles)
      .noRetry()
      .suppressFailureWhenFinished()
      .onFailure((file, exc) -> LOG.warn("Failed to delete file: {}", file, exc))
      .run(deleteFunc::accept);

  return orphanFiles;
}
 
Example 6
Source Project: iceberg   Source File: RemoveOrphanFilesAction.java    License: Apache License 2.0
private Dataset<Row> buildActualFileDF() {
  List<String> subDirs = Lists.newArrayList();
  List<String> matchingFiles = Lists.newArrayList();

  Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

  // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver
  listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, matchingFiles);

  JavaRDD<String> matchingFileRDD = sparkContext.parallelize(matchingFiles, 1);

  if (subDirs.isEmpty()) {
    return spark.createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
  }

  int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism);
  JavaRDD<String> subDirRDD = sparkContext.parallelize(subDirs, parallelism);

  Broadcast<SerializableConfiguration> conf = sparkContext.broadcast(hadoopConf);
  JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp));

  JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
  return spark.createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
}
 
Example 7
Source Project: sylph   Source File: StructuredNodeLoader.java    License: Apache License 2.0
private static TransForm<Dataset<Row>> loadRealTimeTransForm(RealTimeTransForm realTimeTransForm)
{
    return stream -> {
        // In Spark 2.x, mapping over a Dataset requires the type mapping below, i.e. the result schema/encoder must be specified explicitly
        //implicit val matchError:org.apache.spark.sql.Encoder[Row] = org.apache.spark.sql.Encoders.kryo[Row]
        //      import collection.JavaConverters._
        //      val mapRowSchema = realTimeTransForm.getRowSchema.getFields.asScala.map(filed => {
        //        StructField(filed.getName, SparkRow.SparkRowParser.parserType(filed.getJavaType), true)
        //      })
        //      RowEncoder.apply(StructType(mapRowSchema))

        //implicit val mapenc = RowEncoder.apply(rddSchema)  // cannot be registered here because only basic SQL types are supported   //Encoders.STRING
        Dataset<Row> transStream = stream.mapPartitions(
                (MapPartitionsFunction<Row, Row>) partition -> StreamNodeLoader.transFunction(partition, realTimeTransForm),
                Encoders.kryo(Row.class));
        // alternatively, use transStream.as()
        return transStream;
    };
}
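The commented-out Scala in Example 7 points at the alternative to Encoders.kryo(Row.class): build an explicit Row encoder from a schema so Spark knows the result columns. Below is a minimal, hedged Java sketch of that approach against the Spark 2.x API; the schema, data, and identity transform are illustrative assumptions, not part of the sylph code.

import java.util.Arrays;

import org.apache.spark.api.java.function.MapPartitionsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class RowEncoderSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("RowEncoderSketch")
        .getOrCreate();

    // explicit schema for the mapped rows (illustrative)
    StructType schema = new StructType()
        .add("name", DataTypes.StringType)
        .add("value", DataTypes.IntegerType);

    // RowEncoder.apply builds an Encoder<Row> for that schema (Spark 2.x API)
    Encoder<Row> rowEncoder = RowEncoder.apply(schema);

    Dataset<Row> input = spark.createDataset(
        Arrays.asList(RowFactory.create("a", 1), RowFactory.create("b", 2)), rowEncoder);

    // mapPartitions with the explicit Row encoder instead of Encoders.kryo(Row.class);
    // the identity transform stands in for the real per-partition logic
    Dataset<Row> mapped = input.mapPartitions(
        (MapPartitionsFunction<Row, Row>) partition -> partition, rowEncoder);

    mapped.show();
    spark.stop();
  }
}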
 
Example 8
Source Project: mmtf-spark   Source File: MutationToStructureDemo.java    License: Apache License 2.0
public static void main(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local[*]").appName(MutationToStructureDemo.class.getSimpleName())
            .getOrCreate();

    // find missense mutations that map to UniProt ID P15056 (BRAF)
    // that are annotated as pathogenic or likely pathogenic in ClinVar.
    List<String> uniprotIds = Arrays.asList("P15056"); // BRAF: P15056
    String query = "clinvar.rcv.clinical_significance:pathogenic OR clinvar.rcv.clinical_significance:likely pathogenic";
    Dataset<Row> df = MyVariantDataset.getVariations(uniprotIds, query).cache();
    System.out.println("BRAF missense mutations: " + df.count());
    df.show();
    
    // extract the list of variant Ids
    List<String> variantIds = df.select("variationId").as(Encoders.STRING()).collectAsList();
    
    // map to PDB structures
    Dataset<Row> ds = G2SDataset.getPositionDataset(variantIds);
    ds = ds.sort("structureId","chainId","pdbPosition");
    ds.show();

    spark.close(); 
}
 
Example 9
Source Project: mmtf-spark   Source File: PdbToUniProt.java    License: Apache License 2.0
/**
 * Returns an up-to-date dataset of PDB to UniProt
 * chain-level mappings for a list of ids.
 * Valid ids are either a list of pdbIds (e.g. 1XYZ) or pdbId.chainIds (e.g., 1XYZ.A).
 * 
 * @param ids list of pdbIds or pdbId.chainIds
 * @return dataset of PDB to UniProt chain-level mappings
 * @throws IOException
 */
public static Dataset<Row> getChainMappings(List<String> ids) throws IOException {
    SparkSession spark = SparkSession.builder().getOrCreate();
    
    // get a dataset of up-to-date UniProt chain mappings
    Dataset<Row> ds = getChainMappings();  
    // create a dataset of ids from the passed-in list
    Dataset<Row> subset = spark.createDataset(ids, Encoders.STRING()).toDF("id");
    
    // create subsets of data
    if (!ids.isEmpty()) {
        if (ids.get(0).length() == 4) {
            // join by pdbId
            ds = ds.join(subset, ds.col("structureId").equalTo(subset.col("id"))).drop("id");    
        } else {
            // join by pdbChainId
            ds = ds.join(subset, ds.col("structureChainId").equalTo(subset.col("id"))).drop("id");    
        }
    }
    
    return ds;
}
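As a usage note for the javadoc above, the following hypothetical driver (written as if it lived inside the same PdbToUniProt class and assuming java.util.Arrays is imported) exercises both id formats the method accepts; the id values are the placeholders from the javadoc.

public static void main(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("PdbToUniProtChainMappings")
            .getOrCreate();

    // 4-character pdbIds: getChainMappings joins on structureId
    Dataset<Row> byEntry = getChainMappings(Arrays.asList("1XYZ"));
    byEntry.show();

    // pdbId.chainId values: getChainMappings joins on structureChainId
    Dataset<Row> byChain = getChainMappings(Arrays.asList("1XYZ.A"));
    byChain.show();

    spark.stop();
}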
 
Example 10
Source Project: net.jgp.labs.spark   Source File: CsvToDatasetBookAsJson.java    License: Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "CSV to Dataset<Book> as JSON").master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> bookDf = df.map(new BookMapper(), Encoders.STRING());
  bookDf.show(20, 132);

  Dataset<Row> bookAsJsonDf = spark.read().json(bookDf);
  bookAsJsonDf.show();
}
 
Example 11
Source Project: envelope   Source File: TestSQLDeriver.java    License: Apache License 2.0
@Test
public void testQueryLiteral() throws Exception {
  Contexts.getSparkSession().createDataset(Lists.newArrayList(1), Encoders.INT()).createOrReplaceTempView("literaltable");

  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(SQLDeriver.QUERY_LITERAL_CONFIG_NAME, "SELECT * FROM literaltable");
  Config config = ConfigFactory.parseMap(configMap);

  SQLDeriver deriver = new SQLDeriver();
  assertNoValidationFailures(deriver, config);
  deriver.configure(config);

  Object result = deriver.derive(Maps.<String, Dataset<Row>>newHashMap()).collectAsList().get(0).get(0);

  assertEquals(1, result);
}
 
Example 12
Source Project: bunsen   Source File: BundlesTest.java    License: Apache License 2.0
@Test
public void testXmlBundleStrings() {

  JavaRDD<String> xmlBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/xml/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> xmlBundles = spark.createDataset(xmlBundlesRdd.rdd(),
      Encoders.STRING());

  xmlBundles.write().saveAsTable("xml_bundle_table");

  JavaRDD<BundleContainer> bundles = BundlesTest.bundles.fromXml(
      spark.sql("select value from xml_bundle_table"), "value");

  Dataset<Row> patients = BundlesTest.bundles.extractEntry(spark,
      bundles,
      Patient.class);

  checkPatients(patients);
}
 
Example 13
Source Project: bunsen   Source File: BundlesTest.java    License: Apache License 2.0
@Test
public void testJsonBundleStrings() {

  JavaRDD<String> jsonBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/json/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> jsonBundles = spark.createDataset(jsonBundlesRdd.rdd(),
      Encoders.STRING());

  jsonBundles.write().saveAsTable("json_bundle_table");

  JavaRDD<BundleContainer> bundlesRdd = bundles.fromJson(
      spark.sql("select value from json_bundle_table"), "value");

  Dataset<Row> patients = BundlesTest.bundles.extractEntry(spark,
      bundlesRdd,
      Patient.class);

  checkPatients(patients);
}
 
Example 14
Source Project: bunsen   Source File: BundlesTest.java    License: Apache License 2.0
@Test
public void testJsonBundleStrings() {

  JavaRDD<String> jsonBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/json/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> jsonBundles = spark.createDataset(jsonBundlesRdd.rdd(),
      Encoders.STRING());

  jsonBundles.write().saveAsTable("json_bundle_table");

  JavaRDD<BundleContainer> bundlesRdd = bundles.fromJson(
      spark.sql("select value from json_bundle_table"), "value");

  Dataset<Patient> patients = BundlesTest.bundles.extractEntry(spark,
      bundlesRdd,
      Patient.class);

  checkPatients(patients);
}
 
Example 15
Source Project: sparkResearch   Source File: DataSetApplication.java    License: Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();
    Person person = new Person("spark",10);
    Encoder<Person> encoder = Encoders.bean(Person.class);
    Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person),encoder);
    dataset.show();
    // final output: {name:spark, age:10}


    /* encoders for common types */
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1,2),integerEncoder);
    Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) {
            return value+1;
        }
    },integerEncoder);
    result.collect();
    // final output: [2,3]

    /* By providing a class, the data can be converted into a Dataset; fields are mapped by name */
    String url = "/usr/local/text.json";
    Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
    personDataset.show();
    // final output: name:...  age:...
}
 
Example 16
Source Project: net.jgp.labs.spark   Source File: ReducerApp.java    License: Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().master("local").getOrCreate();

  List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
  Dataset<Integer> df = spark.createDataset(data, Encoders.INT());
  df.show();
  df.printSchema();
  Integer sumByReduce = df.reduce(new SumByReduce());
  System.out.println("Sum should be 55 and it is... " + sumByReduce);
}
 
Example 17
/**
 * Main function.
 *
 * @param args arguments.
 */
public static void main(final String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("Java Spark SQL user-defined Datasets aggregation example")
    .getOrCreate();

  Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);
  String path = args[0];
  Dataset<Employee> ds = spark.read().json(path).as(employeeEncoder);
  ds.show();
  // +-------+------+
  // |   name|salary|
  // +-------+------+
  // |Michael|  3000|
  // |   Andy|  4500|
  // | Justin|  3500|
  // |  Berta|  4000|
  // +-------+------+

  MyAverage myAverage = new MyAverage();
  // Convert the function to a `TypedColumn` and give it a name
  TypedColumn<Employee, Double> averageSalary = myAverage.toColumn().name("average_salary");
  Dataset<Double> result = ds.select(averageSalary);
  result.show();
  // +--------------+
  // |average_salary|
  // +--------------+
  // |        3750.0|
  // +--------------+
  spark.stop();
}
 
Example 18
Source Project: iceberg   Source File: TestRemoveOrphanFilesAction.java    License: Apache License 2.0
@Test
public void testMetadataFolderIsIntact() throws InterruptedException {
  // write data directly to the table location
  Map<String, String> props = Maps.newHashMap();
  props.put(TableProperties.WRITE_NEW_DATA_LOCATION, tableLocation);
  Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(
      new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")
  );
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);

  df.select("c1", "c2", "c3")
      .write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  df.write().mode("append").parquet(tableLocation + "/c2_trunc=AA/c3=AAAA");

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  Actions actions = Actions.forTable(table);

  List<String> result = actions.removeOrphanFiles()
      .olderThan(System.currentTimeMillis())
      .execute();

  Assert.assertEquals("Should delete 1 file", 1, result.size());

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();
  Assert.assertEquals("Rows must match", records, actualRecords);
}
 
Example 19
Source Project: iceberg   Source File: TestRemoveOrphanFilesAction.java    License: Apache License 2.0
@Test
public void testManyTopLevelPartitions() throws InterruptedException {
  Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList();
  for (int i = 0; i < 100; i++) {
    records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i)));
  }

  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);

  df.select("c1", "c2", "c3")
      .write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  Actions actions = Actions.forTable(table);

  List<String> result = actions.removeOrphanFiles()
      .olderThan(System.currentTimeMillis())
      .execute();

  Assert.assertTrue("Should not delete any files", result.isEmpty());

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();
  Assert.assertEquals("Rows must match", records, actualRecords);
}
 
Example 20
Source Project: iceberg   Source File: TestRemoveOrphanFilesAction.java    License: Apache License 2.0
@Test
public void testManyLeafPartitions() throws InterruptedException {
  Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList();
  for (int i = 0; i < 100; i++) {
    records.add(new ThreeColumnRecord(i, String.valueOf(i % 3), String.valueOf(i)));
  }

  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);

  df.select("c1", "c2", "c3")
      .write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  Actions actions = Actions.forTable(table);

  List<String> result = actions.removeOrphanFiles()
      .olderThan(System.currentTimeMillis())
      .execute();

  Assert.assertTrue("Should not delete any files", result.isEmpty());

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();
  Assert.assertEquals("Rows must match", records, actualRecords);
}
 
Example 21
Source Project: iceberg   Source File: TestRemoveOrphanFilesAction.java    License: Apache License 2.0
private List<String> snapshotFiles(long snapshotId) {
  return spark.read().format("iceberg")
      .option("snapshot-id", snapshotId)
      .load(tableLocation + "#files")
      .select("file_path")
      .as(Encoders.STRING())
      .collectAsList();
}
 
Example 22
@Test
public void testImportPartitions() throws IOException {
  Table table = TABLES.create(SCHEMA, SPEC, tableLocation);

  List<SimpleRecord> records = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  File parquetTableDir = temp.newFolder("parquet_table");
  String parquetTableLocation = parquetTableDir.toURI().toString();

  try {
    Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
    inputDF.select("id", "data").write()
        .format("parquet")
        .mode("append")
        .option("path", parquetTableLocation)
        .partitionBy("data")
        .saveAsTable("parquet_table");

    File stagingDir = temp.newFolder("staging-dir");
    List<SparkPartition> partitions = SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
    SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());

    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));

    List<SimpleRecord> actualRecords = spark.read()
        .format("iceberg")
        .load(tableLocation)
        .orderBy("id")
        .as(Encoders.bean(SimpleRecord.class))
        .collectAsList();

    Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
 
Example 23
Source Project: iceberg   Source File: TestFilteredScan.java    License: Apache License 2.0
@Test
public void testUnpartitionedStartsWith() {
  Dataset<Row> df = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  List<String> matchedData = df.select("data")
      .where("data LIKE 'jun%'")
      .as(Encoders.STRING())
      .collectAsList();

  Assert.assertEquals(1, matchedData.size());
  Assert.assertEquals("junction", matchedData.get(0));
}
 
Example 24
Source Project: net.jgp.labs.spark   Source File: BookUrlBuilderApp.java    License: Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName("Book URL Builder")
      .master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> ds = df.map(new BookUrlBuilder(), Encoders.STRING());
  ds.printSchema();
  ds.show(20, 80);
}
 
Example 25
Source Project: iceberg   Source File: RemoveOrphanFilesAction.java    License: Apache License 2.0
private Dataset<Row> buildValidMetadataFileDF() {
  String allManifestsMetadataTable = metadataTableName(MetadataTableType.ALL_MANIFESTS);
  Dataset<Row> manifestDF = spark.read().format("iceberg")
      .load(allManifestsMetadataTable)
      .selectExpr("path as file_path");

  List<String> otherMetadataFiles = Lists.newArrayList();

  for (Snapshot snapshot : table.snapshots()) {
    String manifestListLocation = snapshot.manifestListLocation();
    if (manifestListLocation != null) {
      otherMetadataFiles.add(manifestListLocation);
    }
  }

  otherMetadataFiles.add(ops.metadataFileLocation("version-hint.text"));

  TableMetadata metadata = ops.current();
  otherMetadataFiles.add(metadata.metadataFileLocation());
  for (TableMetadata.MetadataLogEntry previousMetadataFile : metadata.previousFiles()) {
    otherMetadataFiles.add(previousMetadataFile.file());
  }

  Dataset<Row> otherMetadataFileDF = spark
      .createDataset(otherMetadataFiles, Encoders.STRING())
      .toDF("file_path");

  return manifestDF.union(otherMetadataFileDF);
}
 
Example 26
Source Project: iceberg   Source File: TestDataSourceOptions.java    License: Apache License 2.0
@Test
public void testHadoopOptions() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  Configuration sparkHadoopConf = spark.sessionState().newHadoopConf();
  String originalDefaultFS = sparkHadoopConf.get("fs.default.name");

  try {
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    tables.create(SCHEMA, spec, options, tableLocation);

    // set an invalid value for 'fs.default.name' in Spark Hadoop config
    // to verify that 'hadoop.' data source options are propagated correctly
    sparkHadoopConf.set("fs.default.name", "hdfs://localhost:9000");

    List<SimpleRecord> expectedRecords = Lists.newArrayList(
        new SimpleRecord(1, "a"),
        new SimpleRecord(2, "b")
    );
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write()
        .format("iceberg")
        .mode("append")
        .option("hadoop.fs.default.name", "file:///")
        .save(tableLocation);

    Dataset<Row> resultDf = spark.read()
        .format("iceberg")
        .option("hadoop.fs.default.name", "file:///")
        .load(tableLocation);
    List<SimpleRecord> resultRecords = resultDf.orderBy("id")
        .as(Encoders.bean(SimpleRecord.class))
        .collectAsList();

    Assert.assertEquals("Records should match", expectedRecords, resultRecords);
  } finally {
    sparkHadoopConf.set("fs.default.name", originalDefaultFS);
  }
}
 
Example 27
@Test(expected = EsHadoopIllegalArgumentException.class)
public void test0FailOnIndexCreationDisabled() throws Exception {
    String target = wrapIndex(resource("test-nonexisting", "data"));
    JavaStreamingQueryTestHarness<RecordBean> test = new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

    RecordBean doc1 = new RecordBean();
    doc1.setId(1);
    doc1.setName("Spark");

    RecordBean doc2 = new RecordBean();
    doc2.setId(2);
    doc2.setName("Hadoop");

    RecordBean doc3 = new RecordBean();
    doc3.setId(3);
    doc3.setName("YARN");

    Dataset<RecordBean> dataset = test
            .withInput(doc1)
            .withInput(doc2)
            .withInput(doc3)
            .expectingToThrow(EsHadoopIllegalArgumentException.class)
            .stream();

    test.run(
            dataset.writeStream()
                    .option("checkpointLocation", checkpoint(target))
                    .option(ES_INDEX_AUTO_CREATE, "no")
                    .format("es"),
            target
    );

    assertTrue(!RestUtils.exists(target));
}
 
Example 28
Source Project: iceberg   Source File: TestPartitionValues.java    License: Apache License 2.0
@Test
public void testNullPartitionValue() throws Exception {
  String desc = "null_part";
  File parent = temp.newFolder(desc);
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

  HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
  Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString());
  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c"),
      new SimpleRecord(4, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(location.toString());

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result
      .orderBy("id")
      .as(Encoders.bean(SimpleRecord.class))
      .collectAsList();

  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example 29
Source Project: iceberg   Source File: TestPartitionValues.java    License: Apache License 2.0
@Test
public void testReorderedColumns() throws Exception {
  String desc = "reorder_columns";
  File parent = temp.newFolder(desc);
  File location = new File(parent, "test");
  File dataFolder = new File(location, "data");
  Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());

  HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
  Table table = tables.create(SIMPLE_SCHEMA, SPEC, location.toString());
  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  List<SimpleRecord> expected = Lists.newArrayList(
          new SimpleRecord(1, "a"),
          new SimpleRecord(2, "b"),
          new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("data", "id").write()
          .format("iceberg")
          .mode("append")
          .option("check-ordering", "false")
          .save(location.toString());

  Dataset<Row> result = spark.read()
          .format("iceberg")
          .load(location.toString());

  List<SimpleRecord> actual = result
          .orderBy("id")
          .as(Encoders.bean(SimpleRecord.class))
          .collectAsList();

  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example 30
Source Project: iceberg   Source File: TestSparkDataWrite.java    License: Apache License 2.0
@Test
public void testUnpartitionedOverwrite() throws IOException {
  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  // overwrite with the same data; should not produce two copies
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("overwrite")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}