Java Code Examples for org.apache.spark.sql.types.StructType

The following examples show how to use org.apache.spark.sql.types.StructType. They are extracted from open source projects; the originating project, source file, and license are noted above each example where available.
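
Before diving into the project examples, here is a minimal sketch of building a StructType programmatically with the DataTypes factory methods; the column names below are purely illustrative and not taken from any of the projects.

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Minimal sketch: a two-column schema with a non-nullable id and a nullable name.
List<StructField> fields = new ArrayList<>();
fields.add(DataTypes.createStructField("id", DataTypes.LongType, false));
fields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
StructType schema = DataTypes.createStructType(fields);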
Example 1
Source Project: envelope   Source File: ProtobufUtils.java   License: Apache License 2.0
/**
 * Retrieves and converts Protobuf fields from a Message.
 * <p>
 * If the field in the {@link com.google.protobuf.Descriptors.Descriptor} exists in the {@link Message}, the value is
 * retrieved and converted using {@link #getFieldValue(Descriptors.FieldDescriptor, Object, DataType)}.
 * Otherwise, the field value is {@code null}.
 * The extraction honors the order of the {@code Descriptor}.
 *
 * @param dsc the Protobuf Descriptor with all fields
 * @param msg the Message with the current field values
 * @param schema the Dataset schema derived from the Descriptor
 * @return a list of converted values
 */
public static List<Object> buildRowValues(Descriptors.Descriptor dsc, Message msg, StructType schema) {
  List<Object> values = new ArrayList<>();
  Object val;

  for (Descriptors.FieldDescriptor fd : dsc.getFields()) {
    if ( (!fd.isRepeated() && msg.hasField(fd)) || (fd.isRepeated() && msg.getRepeatedFieldCount(fd) > 0) ) {
      val = getFieldValue(fd, msg.getField(fd), schema.apply(fd.getName()).dataType());
    } else {
      LOG.trace("FieldDescriptor[{}] => not found", fd.getFullName());
      val = null;
    }
    values.add(val);
  }

  return values;
}
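
For context, a hypothetical call site (not part of the envelope source above) could turn the returned values into a Spark Row; the descriptor, message, and schema variables are assumed to already exist:

// Hypothetical usage sketch; 'descriptor', 'message', and 'schema' are assumed to be in scope.
List<Object> values = ProtobufUtils.buildRowValues(descriptor, message, schema);
Row row = RowFactory.create(values.toArray());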
 
Example 2
public BigQueryDataSourceReader(
        TableInfo table,
        BigQueryClient bigQueryClient,
        BigQueryReadClientFactory bigQueryReadClientFactory,
        ReadSessionCreatorConfig readSessionCreatorConfig,
        Optional<String> globalFilter,
        Optional<StructType> schema) {
    this.table = table;
    this.tableId = table.getTableId();
    this.readSessionCreatorConfig = readSessionCreatorConfig;
    this.bigQueryClient = bigQueryClient;
    this.bigQueryReadClientFactory = bigQueryReadClientFactory;
    this.readSessionCreator = new ReadSessionCreator(readSessionCreatorConfig, bigQueryClient, bigQueryReadClientFactory);
    this.globalFilter = globalFilter;
    this.schema = schema;
}
 
Example 3
Source Project: systemds   Source File: MLContextTest.java   License: Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");

	List<Vector> list = new ArrayList<>();
	list.add(Vectors.dense(1.0, 2.0, 3.0));
	list.add(Vectors.dense(4.0, 5.0, 6.0));
	list.add(Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example 4
Source Project: iceberg   Source File: ParquetWithSparkSchemaVisitor.java   License: Apache License 2.0
private static <T> List<T> visitFields(StructType struct, GroupType group,
                                       ParquetWithSparkSchemaVisitor<T> visitor) {
  StructField[] sFields = struct.fields();
  Preconditions.checkArgument(sFields.length == group.getFieldCount(),
      "Structs do not match: %s and %s", struct, group);
  List<T> results = Lists.newArrayListWithExpectedSize(group.getFieldCount());
  for (int i = 0; i < sFields.length; i += 1) {
    Type field = group.getFields().get(i);
    StructField sField = sFields[i];
    Preconditions.checkArgument(field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())),
        "Structs do not match: field %s != %s", field.getName(), sField.name());
    results.add(visitField(sField, field, visitor));
  }

  return results;
}
 
Example 5
private Integer convertOutSchema(Dataset<Row> layoutDs, String fieldName,
                                 org.apache.spark.sql.types.DataType dataType) {
    StructField[] structFieldList = layoutDs.schema().fields();
    String[] columns = layoutDs.columns();

    int index = 0;
    StructField[] outStructFieldList = new StructField[structFieldList.length];
    for (int i = 0; i < structFieldList.length; i++) {
        if (columns[i].equalsIgnoreCase(fieldName)) {
            index = i;
            StructField structField = structFieldList[i];
            outStructFieldList[i] = new StructField(structField.name(), dataType, false, structField.metadata());
        } else {
            outStructFieldList[i] = structFieldList[i];
        }
    }

    OUT_SCHEMA = new StructType(outStructFieldList);

    return index;
}
 
Example 6
Source Project: spark-llap   Source File: SchemaUtilTest.java   License: Apache License 2.0
@Test
public void testBuildHiveCreateTableQueryFromSparkDFSchema() {

  HiveWarehouseSessionState sessionState =
      HiveWarehouseBuilder
          .session(session)
          .userPassword(TEST_USER, TEST_PASSWORD)
          .hs2url(TEST_HS2_URL)
          .dbcp2Conf(TEST_DBCP2_CONF)
          .maxExecResults(TEST_EXEC_RESULTS_MAX)
          .defaultDB(TEST_DEFAULT_DB)
          .sessionStateForTest();
  HiveWarehouseSession hive = new MockHiveWarehouseSessionImpl(sessionState);

  HiveWarehouseSessionImpl.HIVE_WAREHOUSE_CONNECTOR_INTERNAL = "com.hortonworks.spark.sql.hive.llap.MockHiveWarehouseConnector";

  StructType schema = getSchema();
  String query = SchemaUtil.buildHiveCreateTableQueryFromSparkDFSchema(schema, "testDB", "testTable");
  System.out.println("create table query:" + query);
  assertTrue(hive.executeUpdate(query));
}
 
Example 7
Source Project: iceberg   Source File: IcebergSource.java   License: Apache License 2.0
@Override
public StreamWriter createStreamWriter(String runId, StructType dsStruct,
                                       OutputMode mode, DataSourceOptions options) {
  Preconditions.checkArgument(
      mode == OutputMode.Append() || mode == OutputMode.Complete(),
      "Output mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  // Spark 2.4.x passes runId to createStreamWriter instead of real queryId,
  // so we fetch it directly from sparkContext to make writes idempotent
  String queryId = lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY());
  String appId = lazySparkSession().sparkContext().applicationId();

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new StreamingWriter(table, io, encryptionManager, options, queryId, mode, appId, writeSchema, dsStruct);
}
 
Example 8
@Benchmark
@Threads(1)
public void readWithProjectionUsingSparkReader(Blackhole blackhole) throws IOException {
  StructType sparkSchema = SparkSchemaUtil.convert(PROJECTED_SCHEMA);
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(PROJECTED_SCHEMA)
      .readSupport(new ParquetReadSupport())
      .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json())
      .set("spark.sql.parquet.binaryAsString", "false")
      .set("spark.sql.parquet.int96AsTimestamp", "false")
      .callInit()
      .build()) {

    for (InternalRow row : rows) {
      blackhole.consume(row);
    }
  }
}
 
Example 9
Source Project: envelope   Source File: TestMorphlineTranslator.java   License: Apache License 2.0
@Test
public void getSchema() throws Exception {

  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(MorphlineTranslator.ENCODING_KEY, "UTF-8");
  configMap.put(MorphlineTranslator.ENCODING_MSG, "UTF-8");
  configMap.put(MorphlineTranslator.MORPHLINE, getResourcePath(MORPHLINE_FILE));
  configMap.put(MorphlineTranslator.MORPHLINE_ID, "default");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "flat");
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
      Lists.newArrayList("bar", "foo"));
  configMap.put(MorphlineTranslator.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
      Lists.newArrayList("int", "string"));
  Config config = ConfigFactory.parseMap(configMap);

  translator.configure(config);
  StructType schema = translator.getProvidingSchema();

  Assert.assertEquals("Invalid number of SchemaFields", 2, schema.fields().length);
  Assert.assertEquals("Invalid DataType", DataTypes.IntegerType, schema.fields()[0].dataType());
  Assert.assertEquals("Invalid DataType", DataTypes.StringType, schema.fields()[1].dataType());
}
 
Example 10
Source Project: systemds   Source File: MLContextTest.java   License: Apache License 2.0
@Test
public void testDataFrameSumDMLVectorWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with ID column, no format specified");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example 11
Source Project: net.jgp.labs.spark   Source File: RowProcessor.java   License: Apache License 2.0
@Override
public void call(JavaRDD<String> rdd) throws Exception {

  JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
    private static final long serialVersionUID = 5167089361335095997L;

    @Override
    public Row call(String msg) {
      Row row = RowFactory.create(msg);
      return row;
    }
  });
  // Create Schema
  StructType schema = DataTypes.createStructType(
      new StructField[] { DataTypes.createStructField("Message",
          DataTypes.StringType, true) });

  // Get Spark 2.0 session
  SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context()
      .getConf());
  Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
  msgDataFrame.show();
}
 
Example 12
Source Project: hudi   Source File: HoodieReadClient.java   License: Apache License 2.0
/**
 * Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
 *
 * @return a dataframe
 */
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
  assertSqlContext();
  JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD =
      index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
  JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
      lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
  List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
      .map(keyFileTuple -> keyFileTuple._2().get()).collect();

  // record locations might be same for multiple keys, so need a unique list
  Set<String> uniquePaths = new HashSet<>(paths);
  Dataset<Row> originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
  StructType schema = originalDF.schema();
  JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
    HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
        row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
    return new Tuple2<>(key, row);
  });

  // Now, we need to further filter out, for only rows that match the supplied hoodie keys
  JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
  return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
 
Example 13
Source Project: iceberg   Source File: RewriteManifestsAction.java   License: Apache License 2.0
private static MapPartitionsFunction<Row, ManifestFile> toManifests(
    Broadcast<FileIO> io, long maxNumManifestEntries, String location,
    int format, PartitionSpec spec, StructType sparkType) {

  return (MapPartitionsFunction<Row, ManifestFile>) rows -> {
    List<Row> rowsAsList = Lists.newArrayList(rows);

    if (rowsAsList.isEmpty()) {
      return Collections.emptyIterator();
    }

    List<ManifestFile> manifests = Lists.newArrayList();
    if (rowsAsList.size() <= maxNumManifestEntries) {
      manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType));
    } else {
      int midIndex = rowsAsList.size() / 2;
      manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType));
      manifests.add(writeManifest(rowsAsList,  midIndex, rowsAsList.size(), io, location, format, spec, sparkType));
    }

    return manifests.iterator();
  };
}
 
Example 14
Source Project: systemds   Source File: MLContextTest.java   License: Apache License 2.0
@Test
public void testDataFrameSumDMLMllibVectorWithIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, mllib vector with ID column");

	List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleMllibVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example 15
Source Project: systemds   Source File: MLContextTest.java   License: Apache License 2.0
@Test
public void testDataFrameSumDMLMllibVectorWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, mllib vector with no ID column");

	List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<>();
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
	list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example 16
Source Project: systemds   Source File: MLContextTest.java   License: Apache License 2.0
@Test
public void testDataFrameSumDMLDoublesWithNoIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum DML, doubles with no ID column, no format specified");

	List<String> list = new ArrayList<>();
	list.add("2,2,2");
	list.add("3,3,3");
	list.add("4,4,4");
	JavaRDD<String> javaRddString = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
	setExpectedStdOut("sum: 27.0");
	ml.execute(script);
}
 
Example 17
Source Project: systemds   Source File: MLContextTest.java   License: Apache License 2.0
@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");

	List<Tuple2<Double, Vector>> list = new ArrayList<>();
	list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);

	JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

	Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
 
Example 18
Source Project: envelope   Source File: TestRowUtils.java   License: Apache License 2.0
@Test
public void testToRowValueMapRowNested(
    final @Mocked Row inputRow,
    final @Mocked StructType innerSchema,
    final @Mocked StructType outerSchema
) {
  DataType field = DataTypes.createMapType(DataTypes.StringType,
      DataTypes.createMapType(DataTypes.StringType, DataTypes.IntegerType, true)
  );

  Map<Object, Object> expectedInnerMap = Maps.newHashMap();
  expectedInnerMap.put("field1", 1);
  expectedInnerMap.put("field2", 2);

  Map<Object, Object> expectedOuterMap = Maps.newHashMap();
  expectedOuterMap.put("outer", expectedInnerMap);

  new Expectations() {{
    inputRow.schema(); returns(outerSchema, innerSchema);

    outerSchema.fieldNames(); result = new String[] {"outer"};
    innerSchema.fieldNames(); result = new String[] {"field1", "field2"};

    inputRow.get(0); returns(inputRow, 1);
    inputRow.get(1); result = 2;
  }};

  assertEquals("Invalid list of values", expectedOuterMap, RowUtils.toRowValue(inputRow, field));
}
 
Example 19
Source Project: hudi   Source File: CsvDFSSource.java   License: Apache License 2.0
public CsvDFSSource(TypedProperties props,
    JavaSparkContext sparkContext,
    SparkSession sparkSession,
    SchemaProvider schemaProvider) {
  super(props, sparkContext, sparkSession, schemaProvider);
  this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration());
  if (schemaProvider != null) {
    sourceSchema = (StructType) SchemaConverters.toSqlType(schemaProvider.getSourceSchema())
        .dataType();
  } else {
    sourceSchema = null;
  }
}
 
Example 20
Source Project: iceberg   Source File: SparkTable.java   License: Apache License 2.0
public SparkTable(Table icebergTable, StructType requestedSchema) {
  this.icebergTable = icebergTable;
  this.requestedSchema = requestedSchema;

  if (requestedSchema != null) {
    // convert the requested schema to throw an exception if any requested fields are unknown
    SparkSchemaUtil.convert(icebergTable.schema(), requestedSchema);
  }
}
 
Example 21
Source Project: envelope   Source File: KafkaInput.java   License: Apache License 2.0
@Override
public void receiveExpectedSchema(StructType expectedSchema) {
  this.expectedSchema = expectedSchema;

  List<String> fieldNames = Lists.newArrayList(KEY_FIELD_NAME, Translator.VALUE_FIELD_NAME);

  for (String fieldName : fieldNames) {
    if (Lists.newArrayList(expectedSchema.fieldNames()).contains(fieldName)) {
      DataType fieldDataType = expectedSchema.fields()[expectedSchema.fieldIndex(fieldName)].dataType();

      if (fieldDataType.equals(DataTypes.StringType)) {
        kafkaParams.put(fieldName + ".deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
      }
      else if (fieldDataType.equals(DataTypes.BinaryType)) {
        kafkaParams.put(fieldName + ".deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
      }
      else {
        throw new RuntimeException("Translator expects '" + fieldName + "' field to be of type '" + fieldDataType +
            "' but Kafka input only supports providing '" + fieldName + "' field as either string or   binary.");
      }
    }
    else {
      // If the translator doesn't expect the field then provide it as binary
      kafkaParams.put(fieldName + ".deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    }
  }
}
 
Example 22
Source Project: spark-llap   Source File: HiveWarehouseConnector.java   License: Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType schema,
    SaveMode mode, DataSourceOptions options) {
  Map<String, String> params = getOptions(options);
  String stagingDirPrefix = HWConf.LOAD_STAGING_DIR.getFromOptionsMap(params);
  Path path = new Path(stagingDirPrefix);
  Configuration conf = SparkSession.getActiveSession().get().sparkContext().hadoopConfiguration();
  return Optional.of(getDataSourceWriter(jobId, schema, path, params, conf));
}
 
Example 23
Source Project: iceberg   Source File: IcebergSource.java   License: Apache License 2.0
@Override
public SparkTable getTable(StructType schema, Transform[] partitioning, Map<String, String> options) {
  // TODO: if partitioning is non-null, the table is being created?
  // Get Iceberg table from options
  Configuration conf = new Configuration(SparkSession.active().sparkContext().hadoopConfiguration());
  Table icebergTable = getTableAndResolveHadoopConfiguration(options, conf);

  // Build Spark table based on Iceberg table, and return it
  return new SparkTable(icebergTable, schema);
}
 
Example 24
Source Project: envelope   Source File: TestMorphlineDeriver.java   License: Apache License 2.0
@Test (expected = RuntimeException.class)
public void deriveMorphlineMapperFunctionError(
    final @Mocked MorphlineUtils utils
) throws Exception {
  Map<String, Object> paramMap = new HashMap<>();
  paramMap.put(MorphlineDeriver.STEP_NAME_CONFIG, "dep1");
  paramMap.put(MorphlineDeriver.MORPHLINE, "morphline");
  paramMap.put(MorphlineDeriver.MORPHLINE_ID, "id");
  paramMap.put(MorphlineDeriver.SCHEMA_CONFIG + "." + ComponentFactory.TYPE_CONFIG_NAME, "flat");
  paramMap.put(MorphlineDeriver.SCHEMA_CONFIG + "." + FlatSchema.FIELD_NAMES_CONFIG,
      Lists.newArrayList("bar"));
  paramMap.put(MorphlineDeriver.SCHEMA_CONFIG + "." + FlatSchema.FIELD_TYPES_CONFIG,
      Lists.newArrayList("int"));
  final Config config = ConfigFactory.parseMap(paramMap);

  new Expectations() {{
    MorphlineUtils.morphlineMapper(anyString, anyString, (StructType) any, true); result =
        new MorphlineCompilationException("Compile exception", config);
  }};

  Dataset<Row> dataFrame = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create(1)),
      DataTypes.createStructType(Lists.newArrayList(DataTypes.createStructField(
          "baz", DataTypes.IntegerType, false)))
  );

  Map<String, Dataset<Row>> dependencies = Maps.newHashMap();
  dependencies.put("dep1", dataFrame);

  MorphlineDeriver deriver = new MorphlineDeriver();
  assertNoValidationFailures(deriver, config);
  deriver.configure(config);

  deriver.derive(dependencies);
}
 
Example 25
Source Project: envelope   Source File: TestPivotDeriver.java   License: Apache License 2.0
@Test
public void testIntegerDataType() throws Exception {
  List<Row> sourceList = Lists.newArrayList(
      RowFactory.create("A", "hello", 1),
      RowFactory.create("A", "world", 2),
      RowFactory.create("B", "hello", 3),
      RowFactory.create("C", "world", 4));
  StructType schema = DataTypes.createStructType(Lists.newArrayList(
    DataTypes.createStructField("entity_id", DataTypes.StringType, true),
    DataTypes.createStructField("key", DataTypes.StringType, true),
    DataTypes.createStructField("value", DataTypes.IntegerType, true)
  ));
  Dataset<Row> source = Contexts.getSparkSession().createDataFrame(sourceList, schema);

  Map<String, Dataset<Row>> dependencies = Maps.newHashMap();
  dependencies.put("source", source);
  
  Config config = ConfigFactory.empty()
      .withValue(PivotDeriver.STEP_NAME_CONFIG, ConfigValueFactory.fromAnyRef("source"))
      .withValue(PivotDeriver.ENTITY_KEY_FIELD_NAMES_CONFIG, ConfigValueFactory.fromAnyRef(Lists.newArrayList("entity_id")))
      .withValue(PivotDeriver.PIVOT_KEY_FIELD_NAME_CONFIG, ConfigValueFactory.fromAnyRef("key"))
      .withValue(PivotDeriver.PIVOT_VALUE_FIELD_NAME_CONFIG, ConfigValueFactory.fromAnyRef("value"));

  PivotDeriver d = new PivotDeriver();
  assertNoValidationFailures(d, config);
  d.configure(config);
  
  List<Row> results = d.derive(dependencies).collectAsList();
  
  assertEquals(results.size(), 3);
  assertTrue(results.contains(RowFactory.create("A", 1, 2)));
  assertTrue(results.contains(RowFactory.create("B", 3, null)));
  assertTrue(results.contains(RowFactory.create("C", null, 4)));
}
 
Example 26
Source Project: SparkDemo   Source File: JavaElementwiseProductExample.java   License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaElementwiseProductExample")
    .getOrCreate();

  // $example on$
  // Create some vector data; also works for sparse vectors
  List<Row> data = Arrays.asList(
    RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
    RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
  );

  List<StructField> fields = new ArrayList<StructField>(2);
  fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("vector", new VectorUDT(), false));

  StructType schema = DataTypes.createStructType(fields);

  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);

  ElementwiseProduct transformer = new ElementwiseProduct()
    .setScalingVec(transformingVector)
    .setInputCol("vector")
    .setOutputCol("transformedVector");

  // Batch transform the vectors to create new column:
  transformer.transform(dataFrame).show();
  // $example off$
  spark.stop();
}
 
Example 27
Source Project: SparkDemo   Source File: JavaBinarizerExample.java   License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBinarizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 0.1),
    RowFactory.create(1, 0.8),
    RowFactory.create(2, 0.2)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

  Binarizer binarizer = new Binarizer()
    .setInputCol("feature")
    .setOutputCol("binarized_feature")
    .setThreshold(0.5);

  Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

  System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
  binarizedDataFrame.show();
  // $example off$

  spark.stop();
}
 
Example 28
Source Project: systemds   Source File: FrameRDDConverterUtils.java   License: Apache License 2.0
/**
 * Converts a frame schema into a DataFrame schema.
 *
 * @param fschema frame schema
 * @param containsID true if the frame contains an ID column
 * @return Spark StructType of StructFields representing the schema
 */
public static StructType convertFrameSchemaToDFSchema(ValueType[] fschema, boolean containsID)
{
	// generate the schema based on the string of schema
	List<StructField> fields = new ArrayList<>();
	
	// add id column type
	if( containsID )
		fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, 
				DataTypes.DoubleType, true));
	
	// add remaining types
	int col = 1;
	for (ValueType schema : fschema) {
		DataType dt = null;
		switch(schema) {
			case STRING:  dt = DataTypes.StringType; break;
			case FP64:  dt = DataTypes.DoubleType; break;
			case INT64:     dt = DataTypes.LongType; break;
			case BOOLEAN: dt = DataTypes.BooleanType; break;
			default:      dt = DataTypes.StringType;
				LOG.warn("Using default type String for " + schema.toString());
		}
		fields.add(DataTypes.createStructField("C"+col++, dt, true));
	}
	
	return DataTypes.createStructType(fields);
}
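
As a brief usage sketch, assuming ValueType is the SystemDS value-type enum referenced in the switch above (the exact import path varies by SystemDS version):

// Hypothetical usage; the ValueType values match those handled in the switch above.
ValueType[] frameSchema = new ValueType[] { ValueType.STRING, ValueType.FP64, ValueType.INT64 };
StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(frameSchema, true);
// Resulting schema: the ID column (double) followed by C1 (string), C2 (double), C3 (long), all nullable.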
 
Example 29
Source Project: iceberg   Source File: Reader.java   License: Apache License 2.0
@Override
public void pruneColumns(StructType requestedSchema) {
  this.requestedSchema = requestedSchema;

  // invalidate the schema that will be projected
  this.schema = null;
  this.type = null;
}
 
Example 30
Source Project: DataVec   Source File: DataFrames.java   License: Apache License 2.0
/**
 * Create a DataVec schema from a Spark struct type.
 *
 * @param structType the struct type to create the schema from
 * @return the created schema
 */
public static Schema fromStructType(StructType structType) {
    Schema.Builder builder = new Schema.Builder();
    StructField[] fields = structType.fields();
    String[] fieldNames = structType.fieldNames();
    for (int i = 0; i < fields.length; i++) {
        String name = fields[i].dataType().typeName().toLowerCase();
        switch (name) {
            case "double":
                builder.addColumnDouble(fieldNames[i]);
                break;
            case "float":
                builder.addColumnFloat(fieldNames[i]);
                break;
            case "long":
                builder.addColumnLong(fieldNames[i]);
                break;
            case "int":
            case "integer":
                builder.addColumnInteger(fieldNames[i]);
                break;
            case "string":
                builder.addColumnString(fieldNames[i]);
                break;
            default:
                throw new RuntimeException("Unknown type: " + name);
        }
    }

    return builder.build();
}
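
A minimal usage sketch, assuming a Dataset<Row> named df whose columns only use the types handled above:

// Hypothetical usage; 'df' is assumed to be a Dataset<Row> with supported column types.
Schema datavecSchema = DataFrames.fromStructType(df.schema());
System.out.println(datavecSchema);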