org.apache.spark.sql.types.StructType#fieldNames

Source File: DataFrames.java From DataVec with Apache License 2.0

5 votes

/**
 * Builds a DataVec {@link Schema} whose columns mirror the fields of a Spark struct type.
 *
 * <p>The lower-cased type name of each field selects the column type. Recognized names are
 * {@code double}, {@code float}, {@code long}, {@code int}/{@code integer} and {@code string}.
 *
 * @param structType the struct type to create the schema from
 * @return the created schema
 * @throws RuntimeException if a field's type name is not one of the recognized names
 */
public static Schema fromStructType(StructType structType) {
    Schema.Builder schemaBuilder = new Schema.Builder();
    StructField[] structFields = structType.fields();
    String[] columnNames = structType.fieldNames();

    for (int idx = 0; idx < structFields.length; idx++) {
        String typeName = structFields[idx].dataType().typeName().toLowerCase();
        String columnName = columnNames[idx];

        if ("double".equals(typeName)) {
            schemaBuilder.addColumnDouble(columnName);
        } else if ("float".equals(typeName)) {
            schemaBuilder.addColumnFloat(columnName);
        } else if ("long".equals(typeName)) {
            schemaBuilder.addColumnLong(columnName);
        } else if ("int".equals(typeName) || "integer".equals(typeName)) {
            schemaBuilder.addColumnInteger(columnName);
        } else if ("string".equals(typeName)) {
            schemaBuilder.addColumnString(columnName);
        } else {
            // Any other Spark type has no mapping here.
            throw new RuntimeException("Unknown type: " + typeName);
        }
    }

    return schemaBuilder.build();
}

Source File: MorphlineUtils.java From envelope with Apache License 2.0

5 votes

/**
 * Creates a flat-map function that turns each input Row into a Record, runs it through the
 * pipeline identified by {@code morphlineFile}/{@code morphlineId}, and converts every
 * resulting Record into a Row of {@code outputSchema}.
 *
 * @param morphlineFile the morphline file passed to the pipeline lookup
 * @param morphlineId the morphline id passed to the pipeline lookup
 * @param outputSchema the schema used when converting output Records to Rows
 * @param errorOnEmpty flag forwarded unchanged to {@code executePipeline}
 * @return the mapping function
 */
@SuppressWarnings("serial")
public static FlatMapFunction<Row, Row> morphlineMapper(final String morphlineFile, final String morphlineId,
                                                        final StructType outputSchema, final boolean errorOnEmpty) {
  return new FlatMapFunction<Row, Row>() {
    @Override
    public Iterator<Row> call(Row row) throws Exception {
      // Look up the pipeline for this file/id; when none is returned, create one.
      Pipeline existing = MorphlineUtils.getPipeline(morphlineFile, morphlineId);
      Pipeline pipeline = (existing != null)
          ? existing
          : MorphlineUtils.setPipeline(morphlineFile, morphlineId, new Collector(), true);

      // The Row must carry a schema, since its field names become the Record keys.
      StructType rowSchema = row.schema();
      if (rowSchema == null) {
        throw new RuntimeException("Row does not have an associated StructType schema");
      }

      // Copy every positional value of the Row into the Record under its field name.
      // TODO : Confirm nested object conversion
      String[] names = rowSchema.fieldNames();
      Record source = new Record();
      for (int pos = 0; pos < names.length; pos++) {
        source.put(names[pos], row.get(pos));
      }

      // Run the Record through the pipeline.
      List<Record> produced = MorphlineUtils.executePipeline(pipeline, source, errorOnEmpty);

      // Convert each produced Record back into a Row of the output schema.
      List<Row> converted = Lists.newArrayListWithCapacity(produced.size());
      for (Record out : produced) {
        converted.add(MorphlineUtils.convertToRow(outputSchema, out));
      }
      return converted.iterator();
    }
  };
}

Source File: TestMorphlineUtils.java From envelope with Apache License 2.0

5 votes

/**
 * Checks the mapper returned by {@code MorphlineUtils.morphlineMapper} when
 * {@code getPipeline} already yields a pipeline: the input Row's values must be copied
 * into a Record keyed by the schema's field names, and an empty pipeline result must
 * produce no output Rows.
 *
 * @param pipeline mocked pipeline returned by the stubbed {@code getPipeline}
 * @param row mocked input Row handed to the mapper
 * @param schema mocked schema, used as the Row's schema and as the output schema
 * @throws Exception if the mapper call throws
 */
@Test
public void morphlineMapper(
    final @Mocked MorphlineUtils.Pipeline pipeline,
    final @Mocked Row row,
    final @Mocked StructType schema
) throws Exception {

  // Record expectations on MorphlineUtils' static methods and on the mocked Row/schema.
  new Expectations(MorphlineUtils.class) {{
    // The pipeline lookup succeeds, exactly once.
    MorphlineUtils.getPipeline("file", "id"); result = pipeline; times = 1;
    // The pipeline is executed exactly once and yields no Records.
    MorphlineUtils.executePipeline(pipeline, (Record) any, true); result = Lists.newArrayList(); times = 1;
    row.schema(); result = schema;
    // Two positional reads are expected; they return "val1" and then "val2".
    row.get(anyInt); returns("val1", "val2"); times = 2;
    schema.fieldNames(); result = new String[] { "one", "two"};
  }};

  FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
  Iterator<Row> results = function.call(row);

  // An empty Record list from the pipeline means no Rows come back.
  assertEquals("Invalid number of Rows returned", 0, Lists.newArrayList(results).size());

  // Capture the Record that was passed to executePipeline and inspect its contents.
  new Verifications() {{
    Record record;
    MorphlineUtils.executePipeline(pipeline, record = withCapture(), true);
    // One entry per schema field, with the first field holding the first Row value.
    assertEquals(2, record.getFields().size());
    assertEquals("val1", record.get("one").get(0));
  }};
}

Source File: TestMorphlineUtils.java From envelope with Apache License 2.0

5 votes

/**
 * Checks the mapper returned by {@code MorphlineUtils.morphlineMapper} when
 * {@code getPipeline} returns {@code null}: {@code setPipeline} must be called once to
 * obtain the pipeline, after which the Row is converted to a Record and executed as usual.
 *
 * @param pipeline mocked pipeline returned by the stubbed {@code setPipeline}
 * @param row mocked input Row handed to the mapper
 * @param schema mocked schema, used as the Row's schema and as the output schema
 * @throws Exception if the mapper call throws
 */
@Test
public void morphlineMapperNoPipeline(
    final @Mocked MorphlineUtils.Pipeline pipeline,
    final @Mocked Row row,
    final @Mocked StructType schema
) throws Exception {

  // Record expectations on MorphlineUtils' static methods and on the mocked Row/schema.
  new Expectations(MorphlineUtils.class) {{
    // The lookup finds nothing, so the mapper has to fall back to setPipeline.
    MorphlineUtils.getPipeline("file", "id"); result = null; times = 1;
    // setPipeline is expected exactly once and supplies the mocked pipeline.
    MorphlineUtils.setPipeline("file", "id", (MorphlineUtils.Collector) any, true); result = pipeline; times = 1;
    // The pipeline is executed exactly once and yields no Records.
    MorphlineUtils.executePipeline(pipeline, (Record) any, true); result = Lists.newArrayList(); times = 1;
    row.schema(); result = schema;
    // Two positional reads are expected; they return "val1" and then "val2".
    row.get(anyInt); returns("val1", "val2"); times = 2;
    schema.fieldNames(); result = new String[] { "one", "two"};
  }};

  FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
  Iterator<Row> results = function.call(row);

  // An empty Record list from the pipeline means no Rows come back.
  assertEquals("Invalid number of Rows returned", 0, Lists.newArrayList(results).size());

  // Capture the Record that was passed to executePipeline and inspect its contents.
  new Verifications() {{
    Record record;
    MorphlineUtils.executePipeline(pipeline, record = withCapture(), true);
    // One entry per schema field, with the first field holding the first Row value.
    assertEquals(2, record.getFields().size());
    assertEquals("val1", record.get("one").get(0));
  }};
}

Source File: RowUtils.java From envelope with Apache License 2.0

5 votes

/**
 * Produces a new Row containing only the fields named by {@code subsetSchema}, in that
 * schema's field order, with each value looked up by field name in {@code row}.
 *
 * @param row the Row to read values from
 * @param subsetSchema the schema naming the fields to keep; also the schema of the result
 * @return a Row with {@code subsetSchema} holding the selected values
 */
public static Row subsetRow(Row row, StructType subsetSchema) {
  Object[] picked = new Object[subsetSchema.length()];
  String[] names = subsetSchema.fieldNames();

  for (int pos = 0; pos < names.length; pos++) {
    // Resolve the position by name in the source Row, which may order fields differently.
    int sourceIndex = row.fieldIndex(names[pos]);
    picked[pos] = row.get(sourceIndex);
  }

  return new RowWithSchema(subsetSchema, picked);
}

Source File: TestRowUtils.java From envelope with Apache License 2.0

5 votes

/**
 * Checks that {@code RowUtils.toRowValue} converts a Row nested inside another Row into a
 * map of maps when the target type is map(string -> map(string -> integer)).
 *
 * @param inputRow mocked Row that serves as both the outer Row and the nested Row
 * @param innerSchema mocked schema reported on the second {@code schema()} call
 * @param outerSchema mocked schema reported on the first {@code schema()} call
 */
@Test
public void testToRowValueMapRowNested(
    final @Mocked Row inputRow,
    final @Mocked StructType innerSchema,
    final @Mocked StructType outerSchema
) {
  // Target type: a string-keyed map whose values are string-to-integer maps.
  DataType field = DataTypes.createMapType(DataTypes.StringType,
      DataTypes.createMapType(DataTypes.StringType, DataTypes.IntegerType, true)
  );

  Map<Object, Object> expectedInnerMap = Maps.newHashMap();
  expectedInnerMap.put("field1", 1);
  expectedInnerMap.put("field2", 2);

  Map<Object, Object> expectedOuterMap = Maps.newHashMap();
  expectedOuterMap.put("outer", expectedInnerMap);

  new Expectations() {{
    // The same mock plays both rows: schema() returns the outer schema first, then the inner.
    inputRow.schema(); returns(outerSchema, innerSchema);

    outerSchema.fieldNames(); result = new String[] {"outer"};
    innerSchema.fieldNames(); result = new String[] {"field1", "field2"};

    // First get(0) returns the nested Row (the mock itself); the next get(0) returns 1.
    inputRow.get(0); returns(inputRow, 1);
    inputRow.get(1); result = 2;
  }};

  assertEquals("Invalid list of values", expectedOuterMap, RowUtils.toRowValue(inputRow, field));
}

Source File: DataFrames.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Builds a DataVec {@link Schema} whose columns mirror the fields of a Spark struct type.
 *
 * <p>The lower-cased type name of each field selects the column type. Recognized names are
 * {@code double}, {@code float}, {@code long}, {@code int}/{@code integer} and {@code string}.
 *
 * @param structType the struct type to create the schema from
 * @return the created schema
 * @throws RuntimeException if a field's type name is not one of the recognized names
 */
public static Schema fromStructType(StructType structType) {
    Schema.Builder schemaBuilder = new Schema.Builder();
    StructField[] structFields = structType.fields();
    String[] columnNames = structType.fieldNames();

    for (int idx = 0; idx < structFields.length; idx++) {
        String typeName = structFields[idx].dataType().typeName().toLowerCase();
        String columnName = columnNames[idx];

        if ("double".equals(typeName)) {
            schemaBuilder.addColumnDouble(columnName);
        } else if ("float".equals(typeName)) {
            schemaBuilder.addColumnFloat(columnName);
        } else if ("long".equals(typeName)) {
            schemaBuilder.addColumnLong(columnName);
        } else if ("int".equals(typeName) || "integer".equals(typeName)) {
            schemaBuilder.addColumnInteger(columnName);
        } else if ("string".equals(typeName)) {
            schemaBuilder.addColumnString(columnName);
        } else {
            // Any other Spark type has no mapping here.
            throw new RuntimeException("Unknown type: " + typeName);
        }
    }

    return schemaBuilder.build();
}

Source File: SchemaIntrospectionApp.java From net.jgp.labs.spark with Apache License 2.0

4 votes

/**
 * Builds a ten-row dataframe with scalar, array, struct and array-of-struct columns,
 * shows it and prints its schema, then logs what the schema reports about itself: the field
 * names, the catalog string and the DDL of each field.
 */
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Array to Dataframe (Dataset<Row>)")
      .master("local")
      .getOrCreate();

  // Type of the "struct" column.
  StructType structColumnType = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("sid", DataTypes.IntegerType, false),
      DataTypes.createStructField("svalue", DataTypes.StringType, false) });

  // Element type of the "array-struct" column.
  StructType arrayElementType = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("asid", DataTypes.IntegerType, false),
      DataTypes.createStructField("asvalue", DataTypes.StringType, false) });

  StructType schema = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("id", DataTypes.IntegerType, false),
      DataTypes.createStructField("value-s", DataTypes.StringType, false),
      DataTypes.createStructField("value-d", DataTypes.DoubleType, false),
      DataTypes.createStructField(
          "array", DataTypes.createArrayType(DataTypes.StringType, false), false),
      DataTypes.createStructField("struct", structColumnType, false),
      DataTypes.createStructField(
          "array-struct", DataTypes.createArrayType(arrayElementType), false) });

  // Ten rows, each with a two-element string array, one struct and three sub-rows.
  List<Row> data = new ArrayList<>();
  for (int rowNo = 0; rowNo < 10; rowNo++) {
    List<Row> nestedRows = new ArrayList<>();
    for (int subId = 1000; subId < 1003; subId++) {
      nestedRows.add(RowFactory.create(subId, "Sub " + subId));
    }
    Row structValue = RowFactory.create(rowNo * 5000, "Struct #" + rowNo);
    String[] arrayValue = { "v" + (rowNo * 100), "v" + (rowNo * 100 + 1) };
    data.add(RowFactory.create(
        rowNo, "Value " + rowNo, rowNo / 4.0, arrayValue, structValue, nestedRows));
  }

  Dataset<Row> df = spark.createDataFrame(data, schema);
  df.show(false);
  df.printSchema();

  // Introspect the schema as reported by the dataframe itself.
  StructType readSchema = df.schema();

  String[] fieldNames = readSchema.fieldNames();
  for (int idx = 0; idx < fieldNames.length; idx++) {
    log.info("Field #{}: '{}'", idx, fieldNames[idx]);
  }

  log.info("Catalog: '{}'", readSchema.catalogString());

  StructField[] fields = readSchema.fields();
  for (int idx = 0; idx < fields.length; idx++) {
    log.info("DDL for field #{}: '{}'", idx, fields[idx].toDDL());
  }
}

Java Code Examples for org.apache.spark.sql.types.StructType#fieldNames()