Java Code Examples for org.apache.spark.sql.Encoder

The following examples show how to use org.apache.spark.sql.Encoder. They are extracted from open source projects; the source project, source file, and license are noted above each example.
Example 1
Source Project: beam   Source File: EncoderHelpers.java    License: Apache License 2.0
/**
 * Wrap a Beam coder into a Spark Encoder using Catalyst Expression Encoders (which uses java code
 * generation).
 */
public static <T> Encoder<T> fromBeamCoder(Coder<T> coder) {
  Class<? super T> clazz = coder.getEncodedTypeDescriptor().getRawType();
  ClassTag<T> classTag = ClassTag$.MODULE$.apply(clazz);
  List<Expression> serializers =
      Collections.singletonList(
          new EncodeUsingBeamCoder<>(new BoundReference(0, new ObjectType(clazz), true), coder));

  return new ExpressionEncoder<>(
      SchemaHelpers.binarySchema(),
      false,
      JavaConversions.collectionAsScalaIterable(serializers).toSeq(),
      new DecodeUsingBeamCoder<>(
          new Cast(new GetColumnByOrdinal(0, BinaryType), BinaryType), classTag, coder),
      classTag);
}
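
A minimal usage sketch for fromBeamCoder, assuming Beam's StringUtf8Coder and an existing SparkSession (the variable names are illustrative):

// Hypothetical usage: wrap Beam's StringUtf8Coder into a Spark Encoder.
// "spark" is assumed to be an already-built SparkSession.
Encoder<String> stringEncoder = EncoderHelpers.fromBeamCoder(StringUtf8Coder.of());
Dataset<String> words = spark.createDataset(Arrays.asList("foo", "bar"), stringEncoder);
words.show();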
 
Example 2
Source Project: sparkResearch   Source File: DataSetApplication.java    License: Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();
    Person person = new Person("spark", 10);
    Encoder<Person> encoder = Encoders.bean(Person.class);
    Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person), encoder);
    dataset.show();
    // Final output: {name:spark;age:10}


    /* Encoders for common types */
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1, 2), integerEncoder);
    Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) {
            return value + 1;
        }
    }, integerEncoder);
    result.collect();
    // Final output: [2, 3]

    /* A DataFrame can be converted to a Dataset by providing a class; the mapping is based on field names */
    String url = "/usr/local/text.json";
    Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
    personDataset.show();
    // Final output: name:...  age:...
}
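
Example 2 relies on a Person JavaBean that is not shown. Encoders.bean requires a public no-argument constructor plus getters and setters; a minimal sketch, with field names inferred from the printed output:

public class Person implements java.io.Serializable {
  private String name;
  private int age;

  public Person() { }

  public Person(String name, int age) {
    this.name = name;
    this.age = age;
  }

  public String getName() { return name; }
  public void setName(String name) { this.name = name; }
  public int getAge() { return age; }
  public void setAge(int age) { this.age = age; }
}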
 
Example 3
/**
 * Main function.
 *
 * @param args arguments.
 */
public static void main(final String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("Java Spark SQL user-defined Datasets aggregation example")
    .getOrCreate();

  Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);
  String path = args[0];
  Dataset<Employee> ds = spark.read().json(path).as(employeeEncoder);
  ds.show();
  // +-------+------+
  // |   name|salary|
  // +-------+------+
  // |Michael|  3000|
  // |   Andy|  4500|
  // | Justin|  3500|
  // |  Berta|  4000|
  // +-------+------+

  MyAverage myAverage = new MyAverage();
  // Convert the function to a `TypedColumn` and give it a name
  TypedColumn<Employee, Double> averageSalary = myAverage.toColumn().name("average_salary");
  Dataset<Double> result = ds.select(averageSalary);
  result.show();
  // +--------------+
  // |average_salary|
  // +--------------+
  // |        3750.0|
  // +--------------+
  spark.stop();
}
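
The MyAverage class is defined elsewhere in the file. Below is a sketch consistent with Spark's typed Aggregator API and the output above; the Employee and Average beans (name/salary and sum/count fields) are assumptions based on the tables shown:

public static class MyAverage extends Aggregator<Employee, Average, Double> {
  // A zero value for this aggregation; should satisfy: a + zero = a.
  public Average zero() {
    return new Average(0L, 0L);
  }
  // Fold one Employee into the running buffer.
  public Average reduce(Average buffer, Employee employee) {
    buffer.setSum(buffer.getSum() + employee.getSalary());
    buffer.setCount(buffer.getCount() + 1);
    return buffer;
  }
  // Merge two intermediate buffers.
  public Average merge(Average b1, Average b2) {
    b1.setSum(b1.getSum() + b2.getSum());
    b1.setCount(b1.getCount() + b2.getCount());
    return b1;
  }
  // Transform the final buffer into the output value.
  public Double finish(Average reduction) {
    return ((double) reduction.getSum()) / reduction.getCount();
  }
  // Encoder for the intermediate buffer type.
  public Encoder<Average> bufferEncoder() {
    return Encoders.bean(Average.class);
  }
  // Encoder for the final output type.
  public Encoder<Double> outputEncoder() {
    return Encoders.DOUBLE();
  }
}

Examples 26 and 27 below show the same two encoder methods from another project's typed UDAF.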
 
Example 4
Source Project: incubator-nemo   Source File: SparkSession.java    License: Apache License 2.0
@Override
public <T> Dataset<T> createDataset(final java.util.List<T> data, final Encoder<T> evidence) {
  final boolean userTriggered = initializeFunction(data, evidence);
  final Dataset<T> result = Dataset.from(super.createDataset(data, evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 5
Source Project: incubator-nemo   Source File: SparkSession.java    License: Apache License 2.0
@Override
public <T> Dataset<T> createDataset(final RDD<T> data, final Encoder<T> evidence) {
  final boolean userTriggered = initializeFunction(data, evidence);
  final Dataset<T> result = Dataset.from(super.createDataset(data, evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 6
Source Project: incubator-nemo   Source File: SparkSession.java    License: Apache License 2.0
@Override
public <T> Dataset<T> createDataset(final scala.collection.Seq<T> data, final Encoder<T> evidence) {
  final boolean userTriggered = initializeFunction(data, evidence);
  final Dataset<T> result = Dataset.from(super.createDataset(data, evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 7
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public <U> Dataset<U> as(final Encoder<U> evidence) {
  final boolean userTriggered = initializeFunction(evidence);
  final Dataset<U> result = from(super.as(evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 8
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public <U> Dataset<U> map(final scala.Function1<T, U> func, final Encoder<U> evidence) {
  final boolean userTriggered = initializeFunction(func, evidence);
  final Dataset<U> result = from(super.map(func, evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 9
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public <U> Dataset<U> map(final MapFunction<T, U> func, final Encoder<U> encoder) {
  final boolean userTriggered = initializeFunction(func, encoder);
  final Dataset<U> result = from(super.map(func, encoder));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 10
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public <U> Dataset<U> mapPartitions(
  final scala.Function1<scala.collection.Iterator<T>, scala.collection.Iterator<U>> func,
  final Encoder<U> evidence) {
  final boolean userTriggered = initializeFunction(func, evidence);
  final Dataset<U> result = from(super.mapPartitions(func, evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 11
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public <U> Dataset<U> mapPartitions(final MapPartitionsFunction<T, U> f, final Encoder<U> encoder) {
  final boolean userTriggered = initializeFunction(f, encoder);
  final Dataset<U> result = from(super.mapPartitions(f, encoder));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 12
Source Project: learning-spark-with-java   Source File: JavaBean.java    License: MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("Dataset-JavaBean")
        .master("local[4]")
        .getOrCreate();

    //
    // The Java API requires you to explicitly instantiate an encoder for
    // any JavaBean you want to use for schema inference
    //
    Encoder<Number> numberEncoder = Encoders.bean(Number.class);
    //
    // Create a container of the JavaBean instances
    //
    List<Number> data = Arrays.asList(
            new Number(1, "one", "un"),
            new Number(2, "two", "deux"),
            new Number(3, "three", "trois"));
    //
    // Use the encoder and the container of JavaBean instances to create a
    // Dataset
    //
    Dataset<Number> ds = spark.createDataset(data, numberEncoder);

    System.out.println("*** here is the schema inferred from the bean");
    ds.printSchema();

    System.out.println("*** here is the data");
    ds.show();

    // Use the convenient bean-inferred column names to query
    System.out.println("*** filter by one column and fetch others");
    ds.where(col("i").gt(2)).select(col("english"), col("french")).show();

    spark.stop();
}
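
The Number bean is defined in the same project; a minimal sketch, with the constructor and field names (i, english, french) inferred from the usage and the column query above:

public class Number implements java.io.Serializable {
  private int i;
  private String english;
  private String french;

  public Number() { }

  public Number(int i, String english, String french) {
    this.i = i;
    this.english = english;
    this.french = french;
  }

  public int getI() { return i; }
  public void setI(int i) { this.i = i; }
  public String getEnglish() { return english; }
  public void setEnglish(String english) { this.english = english; }
  public String getFrench() { return french; }
  public void setFrench(String french) { this.french = french; }
}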
 
Example 13
Source Project: nemo   Source File: JavaUserDefinedTypedAggregation.java    License: Apache License 2.0
/**
 * Main function.
 * @param args arguments.
 */
public static void main(final String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Java Spark SQL user-defined Datasets aggregation example")
      .getOrCreate();

  Encoder<Employee> employeeEncoder = Encoders.bean(Employee.class);
  String path = args[0];
  Dataset<Employee> ds = spark.read().json(path).as(employeeEncoder);
  ds.show();
  // +-------+------+
  // |   name|salary|
  // +-------+------+
  // |Michael|  3000|
  // |   Andy|  4500|
  // | Justin|  3500|
  // |  Berta|  4000|
  // +-------+------+

  MyAverage myAverage = new MyAverage();
  // Convert the function to a `TypedColumn` and give it a name
  TypedColumn<Employee, Double> averageSalary = myAverage.toColumn().name("average_salary");
  Dataset<Double> result = ds.select(averageSalary);
  result.show();
  // +--------------+
  // |average_salary|
  // +--------------+
  // |        3750.0|
  // +--------------+
  spark.stop();
}
 
Example 14
Source Project: nemo   Source File: SparkSession.java    License: Apache License 2.0
@Override
public <T> Dataset<T> createDataset(final java.util.List<T> data, final Encoder<T> evidence) {
  final boolean userTriggered = initializeFunction(data, evidence);
  final Dataset<T> result = Dataset.from(super.createDataset(data, evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 15
Source Project: nemo   Source File: SparkSession.java    License: Apache License 2.0
@Override
public <T> Dataset<T> createDataset(final RDD<T> data, final Encoder<T> evidence) {
  final boolean userTriggered = initializeFunction(data, evidence);
  final Dataset<T> result = Dataset.from(super.createDataset(data, evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 16
Source Project: nemo   Source File: SparkSession.java    License: Apache License 2.0
@Override
public <T> Dataset<T> createDataset(final scala.collection.Seq<T> data, final Encoder<T> evidence) {
  final boolean userTriggered = initializeFunction(data, evidence);
  final Dataset<T> result = Dataset.from(super.createDataset(data, evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 17
Source Project: nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public <U> Dataset<U> as(final Encoder<U> evidence) {
  final boolean userTriggered = initializeFunction(evidence);
  final Dataset<U> result = from(super.as(evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 18
Source Project: nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public <U> Dataset<U> map(final scala.Function1<T, U> func, final Encoder<U> evidence) {
  final boolean userTriggered = initializeFunction(func, evidence);
  final Dataset<U> result = from(super.map(func, evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 19
Source Project: nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public <U> Dataset<U> map(final MapFunction<T, U> func, final Encoder<U> encoder) {
  final boolean userTriggered = initializeFunction(func, encoder);
  final Dataset<U> result = from(super.map(func, encoder));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 20
Source Project: nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public <U> Dataset<U> mapPartitions(
    final scala.Function1<scala.collection.Iterator<T>, scala.collection.Iterator<U>> func,
    final Encoder<U> evidence) {
  final boolean userTriggered = initializeFunction(func, evidence);
  final Dataset<U> result = from(super.mapPartitions(func, evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 21
Source Project: nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public <U> Dataset<U> mapPartitions(final MapPartitionsFunction<T, U> f, final Encoder<U> encoder) {
  final boolean userTriggered = initializeFunction(f, encoder);
  final Dataset<U> result = from(super.mapPartitions(f, encoder));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 22
Source Project: iceberg   Source File: TestForwardCompatibility24.java    License: Apache License 2.0
@Override
protected <T> MemoryStream<T> newMemoryStream(int id, SQLContext sqlContext, Encoder<T> encoder) {
  return new MemoryStream<>(id, sqlContext, encoder);
}
 
Example 23
Source Project: iceberg   Source File: TestStructuredStreaming24.java    License: Apache License 2.0
@Override
protected <T> MemoryStream<T> newMemoryStream(int id, SQLContext sqlContext, Encoder<T> encoder) {
  return new MemoryStream<>(id, sqlContext, encoder);
}
 
Example 24
Source Project: iceberg   Source File: TestForwardCompatibility3.java    License: Apache License 2.0
@Override
protected <T> MemoryStream<T> newMemoryStream(int id, SQLContext sqlContext, Encoder<T> encoder) {
  return new MemoryStream<>(id, sqlContext, Option.empty(), encoder);
}
 
Example 25
Source Project: iceberg   Source File: TestStructuredStreaming3.java    License: Apache License 2.0
@Override
protected <T> MemoryStream<T> newMemoryStream(int id, SQLContext sqlContext, Encoder<T> encoder) {
  return new MemoryStream<>(id, sqlContext, Option.empty(), encoder);
}
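
The only difference among these four overrides is the MemoryStream constructor: the Spark 3 variants (Examples 24 and 25) pass an extra scala.Option argument, Option.empty(), presumably a partition count, which the Spark 2.4 constructor does not take. A minimal sketch of how such a factory might be used in a test, assuming newMemoryStream and a SparkSession named spark come from the surrounding test class:

// Hypothetical test usage; "spark" and newMemoryStream are assumed from the test harness.
MemoryStream<Integer> inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT());
Dataset<Integer> streamingDS = inputStream.toDS();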
 
Example 26
Source Project: Apache-Spark-2x-for-Java-Developers   Source File: TypeSafeUDAF.java    License: MIT License
public Encoder<Average> bufferEncoder() {
	return Encoders.bean(Average.class);
}
 
Example 27
Source Project: Apache-Spark-2x-for-Java-Developers   Source File: TypeSafeUDAF.java    License: MIT License
public Encoder<Double> outputEncoder() {
	return Encoders.DOUBLE();
}
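
These two methods are the encoder halves of a typed Aggregator like the MyAverage sketched under Example 3. The Average buffer class they reference is assumed to be a plain JavaBean along these lines:

public class Average implements java.io.Serializable {
  private long sum;
  private long count;

  public Average() { }

  public Average(long sum, long count) {
    this.sum = sum;
    this.count = count;
  }

  public long getSum() { return sum; }
  public void setSum(long sum) { this.sum = sum; }
  public long getCount() { return count; }
  public void setCount(long count) { this.count = count; }
}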
 
Example 28
Source Project: SparkDemo   Source File: JavaSparkSQLExample.java    License: MIT License
private static void runDatasetCreationExample(SparkSession spark) {
  // $example on:create_ds$
  // Create an instance of a Bean class
  Person person = new Person();
  person.setName("Andy");
  person.setAge(32);

  // Encoders are created for Java beans
  Encoder<Person> personEncoder = Encoders.bean(Person.class);
  Dataset<Person> javaBeanDS = spark.createDataset(
    Collections.singletonList(person),
    personEncoder
  );
  javaBeanDS.show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 32|Andy|
  // +---+----+

  // Encoders for most common types are provided in class Encoders
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
  Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
    @Override
    public Integer call(Integer value) throws Exception {
      return value + 1;
    }
  }, integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]

  // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
  String path = Constant.LOCAL_FILE_PREX +"/data/resources/people.json";
  Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
  peopleDS.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_ds$
}
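
Beyond Encoders.INT(), the Encoders class provides factories for the other common types as well as serialization-based fallbacks for arbitrary classes; a few examples (MyClass is a hypothetical placeholder):

Encoder<String> stringEncoder = Encoders.STRING();
Encoder<Long> longEncoder = Encoders.LONG();
Encoder<scala.Tuple2<String, Integer>> pairEncoder =
    Encoders.tuple(Encoders.STRING(), Encoders.INT());
// Kryo-based fallback for classes that are not JavaBeans (MyClass is hypothetical):
Encoder<MyClass> kryoEncoder = Encoders.kryo(MyClass.class);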
 
Example 29
Source Project: learning-spark-with-java   Source File: Basic.java    License: MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("Dataset-Basic")
        .master("local[4]")
        .getOrCreate();

    List<Integer> data = Arrays.asList(10, 11, 12, 13, 14, 15);
    Dataset<Integer> ds = spark.createDataset(data, Encoders.INT());

    System.out.println("*** only one column, and it always has the same name");
    ds.printSchema();

    ds.show();

    System.out.println("*** values > 12");

    // the harder way to filter: a cast is needed to disambiguate the filter overloads
    Dataset<Integer> ds2 = ds.filter((FilterFunction<Integer>) value -> value > 12);

    ds2.show();

    List<Tuple3<Integer, String, String>> tuples =
        Arrays.asList(
            new Tuple3<>(1, "one", "un"),
            new Tuple3<>(2, "two", "deux"),
            new Tuple3<>(3, "three", "trois"));

    Encoder<Tuple3<Integer, String, String>> encoder =
        Encoders.tuple(Encoders.INT(), Encoders.STRING(), Encoders.STRING());

    Dataset<Tuple3<Integer, String, String>> tupleDS =
        spark.createDataset(tuples, encoder);

    System.out.println("*** Tuple Dataset types");
    tupleDS.printSchema();

    // the tuple columns have unfriendly names, but you can use them to query
    System.out.println("*** filter by one column and fetch another");
    tupleDS.where(col("_1").gt(2)).select(col("_2"), col("_3")).show();

    spark.stop();
}
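
If the positional tuple column names are too opaque, Dataset.toDF can rename them; a short sketch (the new names are illustrative):

// Optional: give the tuple columns friendlier names.
Dataset<Row> named = tupleDS.toDF("id", "english", "french");
named.where(col("id").gt(2)).select(col("english"), col("french")).show();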
 
Example 30
Source Project: envelope   Source File: DatasetRowRuleWrapper.java    License: Apache License 2.0
@Override
public Encoder<Row> bufferEncoder() {
  return RowEncoder.apply(SCHEMA);
}
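
SCHEMA is a StructType defined elsewhere in DatasetRowRuleWrapper, since RowEncoder.apply builds an Encoder<Row> from a schema. A hypothetical definition showing the expected shape (field names are illustrative only):

// Hypothetical schema; the real SCHEMA lives elsewhere in DatasetRowRuleWrapper.
private static final StructType SCHEMA = new StructType()
    .add("name", DataTypes.StringType)
    .add("result", DataTypes.BooleanType);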