Java Code Examples for org.apache.spark.sql.types.DataTypes#IntegerType

The following examples show how to use org.apache.spark.sql.types.DataTypes#IntegerType. Each example is taken from the open-source project and source file named above it.
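Before the project examples, here is a minimal, self-contained sketch of the basic pattern they all share: declare an IntegerType column in a StructType and build a Dataset against it (the class, app, and column names are illustrative):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class IntegerTypeSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
      .appName("IntegerTypeSketch")
      .master("local[*]")
      .getOrCreate();

    // DataTypes.IntegerType is the singleton instance for a 32-bit signed integer column.
    StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty())
    });

    List<Row> rows = Arrays.asList(RowFactory.create(1), RowFactory.create(2));
    Dataset<Row> df = spark.createDataFrame(rows, schema);
    df.printSchema();  // id: integer (nullable = false)

    spark.stop();
  }
}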
Example 1
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testIgnoreNulls() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0,105));
  configMap.put(RangeRowRule.IGNORE_NULLS_CONFIG, true);
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", null, new BigDecimal("0.00"));
  assertTrue("Row should pass rule", rule.check(row1));
}
 
Example 2
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testDontIgnoreNulls() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0,105));
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", null, new BigDecimal("0.00"));
  assertFalse("Row should not pass rule", rule.check(row1));
}
 
Example 3
Source File: JavaBinarizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBinarizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 0.1),
    RowFactory.create(1, 0.8),
    RowFactory.create(2, 0.2)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> continuousDataFrame = spark.createDataFrame(data, schema);

  Binarizer binarizer = new Binarizer()
    .setInputCol("feature")
    .setOutputCol("binarized_feature")
    .setThreshold(0.5);

  Dataset<Row> binarizedDataFrame = binarizer.transform(continuousDataFrame);

  System.out.println("Binarizer output with Threshold = " + binarizer.getThreshold());
  binarizedDataFrame.show();
  // $example off$

  spark.stop();
}
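With the threshold at 0.5, the 0.8 feature binarizes to 1.0 and the 0.1 and 0.2 features to 0.0, so the show() call prints roughly the following (formatting approximate):

Binarizer output with Threshold = 0.5
+---+-------+-----------------+
| id|feature|binarized_feature|
+---+-------+-----------------+
|  0|    0.1|              0.0|
|  1|    0.8|              1.0|
|  2|    0.2|              0.0|
+---+-------+-----------------+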
 
Example 4
Source File: InstanceRelationWriter.java    From rdf2x with Apache License 2.0
private DataType getDataType(int type) {
    switch (type) {
        case LiteralType.BOOLEAN:
            return DataTypes.BooleanType;
        case LiteralType.STRING:
            return DataTypes.StringType;
        case LiteralType.FLOAT:
            return DataTypes.FloatType;
        case LiteralType.DOUBLE:
            return DataTypes.DoubleType;
        case LiteralType.INTEGER:
            return DataTypes.IntegerType;
        case LiteralType.LONG:
            return DataTypes.LongType;
        case LiteralType.DATETIME:
            // datetime not supported due to timezone issues with java.sql.Timestamp
            // check the InstanceAggregator for more info
            return DataTypes.StringType;
    }
    throw new NotImplementedException("Not able to write literal type " + type);
}
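A short hedged sketch of how a mapping like this is typically consumed when assembling a schema (the field name is illustrative; LiteralType.INTEGER comes from the switch above):

// Build a nullable IntegerType column from the literal-type code.
StructField ageField = DataTypes.createStructField("age", getDataType(LiteralType.INTEGER), true);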
 
Example 5
Source File: TypeCastStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private DataType mapDataType(List<StructField> datasetFields, String column, String typeConfig) {

        DataType currentDatatype = getCurrentDataType(datasetFields, column);

        // when typeConfig is null (no config for this column), return the current DataType
        if(typeConfig == null) {
            return currentDatatype;
        }

        switch (typeConfig) {
            case "integer":
                return DataTypes.IntegerType;
            case "long":
                return DataTypes.LongType;
            case "double":
                return DataTypes.DoubleType;
            case "boolean":
                return DataTypes.BooleanType;
            case "date":
                return DataTypes.DateType;
            case "timestamp":
                return DataTypes.TimestampType;
            default:
                return DataTypes.StringType;
        }
    }
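The DataType that mapDataType resolves is typically applied with a column cast; a minimal sketch of that follow-up step, assuming a Dataset<Row> named dataset and an illustrative column name:

import static org.apache.spark.sql.functions.col;

// Cast the column to the type resolved from the configuration.
Dataset<Row> casted = dataset.withColumn("duration",
    col("duration").cast(DataTypes.LongType));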
 
Example 6
Source File: TestFlatteningTransformer.java    From hudi with Apache License 2.0
@Test
public void testFlatten() {
  FlatteningTransformer transformer = new FlatteningTransformer();

  // Init
  StructField[] nestedStructFields =
      new StructField[] {new StructField("nestedIntColumn", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("nestedStringColumn", DataTypes.StringType, true, Metadata.empty()),};

  StructField[] structFields =
      new StructField[] {new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty()),
          new StructField("nestedStruct", DataTypes.createStructType(nestedStructFields), true, Metadata.empty())};

  StructType schema = new StructType(structFields);
  String flattenedSql = transformer.flattenSchema(schema, null);

  assertEquals("intColumn as intColumn,stringColumn as stringColumn,"
      + "nestedStruct.nestedIntColumn as nestedStruct_nestedIntColumn,"
      + "nestedStruct.nestedStringColumn as nestedStruct_nestedStringColumn", flattenedSql);
}
 
Example 7
Source File: JavaOneHotEncoderExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaOneHotEncoderExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, "a"),
    RowFactory.create(1, "b"),
    RowFactory.create(2, "c"),
    RowFactory.create(3, "a"),
    RowFactory.create(4, "a"),
    RowFactory.create(5, "c")
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("category", DataTypes.StringType, false, Metadata.empty())
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);

  StringIndexerModel indexer = new StringIndexer()
    .setInputCol("category")
    .setOutputCol("categoryIndex")
    .fit(df);
  Dataset<Row> indexed = indexer.transform(df);

  OneHotEncoder encoder = new OneHotEncoder()
    .setInputCol("categoryIndex")
    .setOutputCol("categoryVec");

  Dataset<Row> encoded = encoder.transform(indexed);
  encoded.show();
  // $example off$

  spark.stop();
}
 
Example 8
Source File: DataFrames.java    From DataVec with Apache License 2.0
/**
 * Convert the DataVec sequence schema to a StructType for Spark, for example for use in
 * {@link #toDataFrameSequence(Schema, JavaRDD)}.
 * <b>Note</b>: as per {@link #toDataFrameSequence(Schema, JavaRDD)}, the StructType has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}) - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(DataRowsFacade)}
 *
 * @param schema Schema to convert
 * @return StructType for the schema
 */
public static StructType fromSchemaSequence(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns() + 2];

    structFields[0] = new StructField(SEQUENCE_UUID_COLUMN, DataTypes.StringType, false, Metadata.empty());
    structFields[1] = new StructField(SEQUENCE_INDEX_COLUMN, DataTypes.IntegerType, false, Metadata.empty());

    for (int i = 0; i < schema.numColumns(); i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                                "This api should not be used with strings , binary data or ndarrays. This is only for columnar data");
        }
    }
    return new StructType(structFields);
}
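A hedged usage sketch, assuming the DataVec Schema.Builder API and illustrative column names:

import org.datavec.api.transform.schema.Schema;

Schema schema = new Schema.Builder()
    .addColumnInteger("steps")
    .addColumnDouble("temperature")
    .build();

StructType structType = DataFrames.fromSchemaSequence(schema);
// -> 4 fields: sequence UUID (string), sequence index (integer), steps (integer), temperature (double)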
 
Example 9
Source File: JavaMinMaxScalerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaMinMaxScalerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
      RowFactory.create(0, Vectors.dense(1.0, 0.1, -1.0)),
      RowFactory.create(1, Vectors.dense(2.0, 1.1, 1.0)),
      RowFactory.create(2, Vectors.dense(3.0, 10.1, 3.0))
  );
  StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  MinMaxScaler scaler = new MinMaxScaler()
    .setInputCol("features")
    .setOutputCol("scaledFeatures");

  // Compute summary statistics and generate MinMaxScalerModel
  MinMaxScalerModel scalerModel = scaler.fit(dataFrame);

  // rescale each feature to range [min, max].
  Dataset<Row> scaledData = scalerModel.transform(dataFrame);
  System.out.println("Features scaled to range: [" + scaler.getMin() + ", "
      + scaler.getMax() + "]");
  scaledData.select("features", "scaledFeatures").show();
  // $example off$

  spark.stop();
}
 
Example 10
Source File: SchemaConverter.java    From geowave with Apache License 2.0
private static SimpleFeatureDataType attrDescToDataType(final AttributeDescriptor attrDesc) {
  boolean isGeom = false;
  DataType dataTypeOut = DataTypes.NullType;

  if (attrDesc.getType().getBinding().equals(String.class)) {

    dataTypeOut = DataTypes.StringType;
  } else if (attrDesc.getType().getBinding().equals(Double.class)) {
    dataTypeOut = DataTypes.DoubleType;
  } else if (attrDesc.getType().getBinding().equals(Float.class)) {
    dataTypeOut = DataTypes.FloatType;
  } else if (attrDesc.getType().getBinding().equals(Long.class)) {
    dataTypeOut = DataTypes.LongType;
  } else if (attrDesc.getType().getBinding().equals(Integer.class)) {
    dataTypeOut = DataTypes.IntegerType;
  } else if (attrDesc.getType().getBinding().equals(Boolean.class)) {
    dataTypeOut = DataTypes.BooleanType;
  } else if (attrDesc.getType().getBinding().equals(Date.class)) {
    dataTypeOut = DataTypes.TimestampType;
  }

  // Custom geometry types get WKB encoding
  else if (Geometry.class.isAssignableFrom(attrDesc.getType().getBinding())) {
    dataTypeOut = GeoWaveSpatialEncoders.geometryUDT;
    isGeom = true;
  }

  return new SimpleFeatureDataType(dataTypeOut, isGeom);
}
 
Example 11
Source File: IntegerColumnBlockTest.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void setPartitionValueTest() {
    IntegerColumnBlock integerColumnBlock = new IntegerColumnBlock(null, DataTypes.IntegerType);
    integerColumnBlock.setPartitionValue("45",1000);
    for (int i = 0; i< 1000; i++) {
        Assert.assertEquals(45,integerColumnBlock.getTestObject(i));
    }
}
 
Example 12
Source File: DataFrames.java    From deeplearning4j with Apache License 2.0
/**
 * Convert the DataVec sequence schema to a StructType for Spark, for example for use in
 * {@link #toDataFrameSequence(Schema, JavaRDD)}.
 * <b>Note</b>: as per {@link #toDataFrameSequence(Schema, JavaRDD)}, the StructType has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}) - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(Dataset<Row>)}
 *
 * @param schema Schema to convert
 * @return StructType for the schema
 */
public static StructType fromSchemaSequence(Schema schema) {
    StructField[] structFields = new StructField[schema.numColumns() + 2];

    structFields[0] = new StructField(SEQUENCE_UUID_COLUMN, DataTypes.StringType, false, Metadata.empty());
    structFields[1] = new StructField(SEQUENCE_INDEX_COLUMN, DataTypes.IntegerType, false, Metadata.empty());

    for (int i = 0; i < schema.numColumns(); i++) {
        switch (schema.getColumnTypes().get(i)) {
            case Double:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.DoubleType, false, Metadata.empty());
                break;
            case Integer:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.IntegerType, false, Metadata.empty());
                break;
            case Long:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.LongType, false, Metadata.empty());
                break;
            case Float:
                structFields[i + 2] =
                                new StructField(schema.getName(i), DataTypes.FloatType, false, Metadata.empty());
                break;
            default:
                throw new IllegalStateException(
                                "This api should not be used with strings , binary data or ndarrays. This is only for columnar data");
        }
    }
    return new StructType(structFields);
}
 
Example 13
Source File: AvroUtils.java    From envelope with Apache License 2.0
/**
 * Convert Avro Types into their associated DataType.
 *
 * @param schemaType Avro Schema.Type
 * @return DataType representation
 */
public static DataType dataTypeFor(Schema schemaType) {
  LOG.trace("Converting Schema[{}] to DataType", schemaType);

  // Unwrap "optional" unions to the base type
  boolean isOptional = isNullable(schemaType);

  if (isOptional) {
    // if only 2 items in the union, then "unwrap," otherwise, it's a full union and should be rendered as such
    if (schemaType.getTypes().size() == 2) {
      LOG.trace("Unwrapping simple 'optional' union for {}", schemaType);
      for (Schema s : schemaType.getTypes()) {
        if (s.getType().equals(NULL)) {
          continue;
        }
        // Unwrap
        schemaType = s;
        break;
      }
    }
  }

  // Convert supported LogicalTypes
  if (null != schemaType.getLogicalType()) {
    LogicalType logicalType = schemaType.getLogicalType();
    switch (logicalType.getName()) {
      case "date" :
        return DataTypes.DateType;
      case "timestamp-millis" :
        return DataTypes.TimestampType;
      case "decimal" :
        LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType;
        return DataTypes.createDecimalType(decimal.getPrecision(), decimal.getScale());
      default:
        // Pass-thru
        LOG.warn("Unsupported LogicalType[{}], continuing with underlying base type", logicalType.getName());
    }
  }

  switch (schemaType.getType()) {
    case RECORD:
      // StructType
      List<StructField> structFieldList = Lists.newArrayListWithCapacity(schemaType.getFields().size());
      for (Field f : schemaType.getFields()) {
        structFieldList.add(DataTypes.createStructField(f.name(), dataTypeFor(f.schema()), isNullable(f.schema())));
      }
      return DataTypes.createStructType(structFieldList);
    case ARRAY:
      Schema elementType = schemaType.getElementType();
      return DataTypes.createArrayType(dataTypeFor(elementType), isNullable(elementType));
    case MAP:
      Schema valueType = schemaType.getValueType();
      return DataTypes.createMapType(DataTypes.StringType, dataTypeFor(valueType), isNullable(valueType));
    case UNION:
      // StructType of members
      List<StructField> unionFieldList = Lists.newArrayListWithCapacity(schemaType.getTypes().size());
      int m = 0;
      for (Schema u : schemaType.getTypes()) {
        unionFieldList.add(DataTypes.createStructField("member" + m++, dataTypeFor(u), isNullable(u)));
      }
      return DataTypes.createStructType(unionFieldList);
    case FIXED:
    case BYTES:
      return DataTypes.BinaryType;
    case ENUM:
    case STRING:
      return DataTypes.StringType;
    case INT:
      return DataTypes.IntegerType;
    case LONG:
      return DataTypes.LongType;
    case FLOAT:
      return DataTypes.FloatType;
    case DOUBLE:
      return DataTypes.DoubleType;
    case BOOLEAN:
      return DataTypes.BooleanType;
    case NULL:
      return DataTypes.NullType;
    default:
      throw new RuntimeException(String.format("Unrecognized or unsupported Avro Type conversion: %s", schemaType));
  }
}
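A hedged usage sketch, assuming an Avro schema parsed with the standard org.apache.avro.Schema.Parser (record and field names illustrative):

import org.apache.avro.Schema;

Schema avro = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"r\",\"fields\":["
  + "{\"name\":\"count\",\"type\":\"int\"},"
  + "{\"name\":\"label\",\"type\":[\"null\",\"string\"]}]}");

DataType sparkType = AvroUtils.dataTypeFor(avro);
// -> StructType(count: IntegerType not null, label: StringType nullable)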
 
Example 14
Source File: JavaInteractionExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaInteractionExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(1, 1, 2, 3, 8, 4, 5),
    RowFactory.create(2, 4, 3, 8, 7, 9, 8),
    RowFactory.create(3, 6, 1, 9, 2, 3, 6),
    RowFactory.create(4, 10, 8, 6, 9, 4, 5),
    RowFactory.create(5, 9, 2, 7, 10, 7, 3),
    RowFactory.create(6, 1, 1, 4, 2, 8, 4)
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id1", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id2", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id3", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id4", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id5", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id6", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id7", DataTypes.IntegerType, false, Metadata.empty())
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);

  VectorAssembler assembler1 = new VectorAssembler()
          .setInputCols(new String[]{"id2", "id3", "id4"})
          .setOutputCol("vec1");

  Dataset<Row> assembled1 = assembler1.transform(df);

  VectorAssembler assembler2 = new VectorAssembler()
          .setInputCols(new String[]{"id5", "id6", "id7"})
          .setOutputCol("vec2");

  Dataset<Row> assembled2 = assembler2.transform(assembled1).select("id1", "vec1", "vec2");

  Interaction interaction = new Interaction()
          .setInputCols(new String[]{"id1","vec1","vec2"})
          .setOutputCol("interactedCol");

  Dataset<Row> interacted = interaction.transform(assembled2);

  interacted.show(false);
  // $example off$

  spark.stop();
}
 
Example 15
Source File: BucketizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void bucketizerTest() {
    double[] validData = {-0.5, -0.3, 0.0, 0.2};
    double[] expectedBuckets = {0.0, 0.0, 1.0, 1.0};
    double[] splits = {-0.5, 0.0, 0.5};

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    List<Row> trainingData = Arrays.asList(
            cr(0, validData[0]),
            cr(1, validData[1]),
            cr(2, validData[2]),
            cr(3, validData[3]));

    DataFrame df = sqlContext.createDataFrame(trainingData, schema);

    Bucketizer sparkModel = new Bucketizer()
            .setInputCol("feature")
            .setOutputCol("result")
            .setSplits(splits);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel, df);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkModel.transform(df).orderBy("id").select("id", "feature", "result").collect();

    for (Row r : sparkOutput) {
        double input = r.getDouble(1);
        double sparkOp = r.getDouble(2);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), input);
        transformer.transform(data);
        double transformedInput = (double) data.get(sparkModel.getOutputCol());

        assertTrue((transformedInput >= 0) && (transformedInput <= 1));
        assertEquals(transformedInput, sparkOp, EPSILON);
        assertEquals(transformedInput, expectedBuckets[r.getInt(0)], EPSILON);
    }
}
 
Example 16
Source File: ConfigurationDataTypes.java    From envelope with Apache License 2.0
public static DataType getSparkDataType(String typeString) {
  DataType type;

  String prec_scale_regex_groups = "\\s*(decimal)\\s*\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)\\s*";
  Pattern prec_scale_regex_pattern = Pattern.compile(prec_scale_regex_groups);
  Matcher prec_scale_regex_matcher = prec_scale_regex_pattern.matcher(typeString);

  if (prec_scale_regex_matcher.matches()) {
    int precision = Integer.parseInt(prec_scale_regex_matcher.group(2)); 
    int scale = Integer.parseInt(prec_scale_regex_matcher.group(3)); 
    type = DataTypes.createDecimalType(precision, scale);
  }
  else {
    switch (typeString) {
      case DECIMAL:
        type = DataTypes.createDecimalType();
        break;
      case STRING:
        type = DataTypes.StringType;
        break;
      case FLOAT:
        type = DataTypes.FloatType;
        break;
      case DOUBLE:
        type = DataTypes.DoubleType;
        break;
      case BYTE:
        type = DataTypes.ByteType;
        break;
      case SHORT:
        type = DataTypes.ShortType;
        break;
      case INT:
        type = DataTypes.IntegerType;
        break;
      case LONG:
        type = DataTypes.LongType;
        break;
      case BOOLEAN:
        type = DataTypes.BooleanType;
        break;
      case BINARY:
        type = DataTypes.BinaryType;
        break;
      case DATE:
        type = DataTypes.DateType;
        break;
      case TIMESTAMP:
        type = DataTypes.TimestampType;
        break;
      default:
        throw new RuntimeException("Unsupported or unrecognized field type: " + typeString);
    } 
  }

  return type;
}
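A hedged usage sketch, assuming the string constants (DECIMAL, INT, and so on) resolve to their lowercase names:

DataType a = getSparkDataType("int");            // DataTypes.IntegerType
DataType b = getSparkDataType("decimal(10,2)");  // DecimalType(10,2), via the precision/scale regex
DataType c = getSparkDataType("decimal");        // default-precision DecimalType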
 
Example 17
Source File: SQLHepler.java    From sylph with Apache License 2.0
static DataType getSparkType(Type type)
{
    if (type instanceof ParameterizedType && ((ParameterizedType) type).getRawType() == Map.class) {
        Type[] arguments = ((ParameterizedType) type).getActualTypeArguments();

        return DataTypes.createMapType(getSparkType(arguments[0]), getSparkType(arguments[1]));
    }
    else if (type instanceof ParameterizedType && ((ParameterizedType) type).getRawType() == List.class) {
        DataType dataType = getSparkType(((ParameterizedType) type).getActualTypeArguments()[0]);

        return DataTypes.createArrayType(dataType);
    }
    else {
        if (type == String.class) {
            return DataTypes.StringType;
        }
        else if (type == int.class || type == Integer.class) {
            return DataTypes.IntegerType;
        }
        else if (type == long.class || type == Long.class) {
            return DataTypes.LongType;
        }
        else if (type == boolean.class || type == Boolean.class) {
            return DataTypes.BooleanType;
        }
        else if (type == double.class || type == Double.class) {
            return DataTypes.DoubleType;
        }
        else if (type == float.class || type == Float.class) {
            return DataTypes.FloatType;
        }
        else if (type == byte.class || type == Byte.class) {
            return DataTypes.ByteType;
        }
        else if (type == Timestamp.class) {
            return DataTypes.TimestampType;
        }
        else if (type == Date.class) {
            return DataTypes.DateType;
        }
        else if (type == byte[].class || type == Byte[].class) {
            return DataTypes.BinaryType;
        }
        else {
            throw new IllegalArgumentException("Type " + type + " is not supported");
        }
    }
}
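A hedged sketch of handing getSparkType a reflected generic type (the holder class and field are illustrative):

import java.lang.reflect.Type;
import java.util.Map;

class Holder {
    Map<String, Integer> counts;
}

// e.g. inside a method declared `throws NoSuchFieldException`:
Type t = Holder.class.getDeclaredField("counts").getGenericType();
DataType mapType = getSparkType(t);  // -> MapType(StringType, IntegerType)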
 
Example 18
Source File: StringSanitizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testStringSanitizer() {

	//prepare data
	JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
			RowFactory.create(1, "Jyoti complex near Sananda clothes store; English Bazar; Malda;WB;India,"),
			RowFactory.create(2, "hallalli vinayaka tent road c/o B K vishwanath Mandya"),
			RowFactory.create(3, "M.sathish S/o devudu Lakshmi opticals Gokavaram bus stand Rajhamundry 9494954476")
	));

	StructType schema = new StructType(new StructField[]{
			new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
			new StructField("rawText", DataTypes.StringType, false, Metadata.empty())
	});
	Dataset<Row> dataset = spark.createDataFrame(rdd, schema);
	dataset.show();

	//train model in spark
	StringSanitizer sparkModel = new StringSanitizer()
			.setInputCol("rawText")
			.setOutputCol("token");

	//Export this model
	byte[] exportedModel = ModelExporter.export(sparkModel);

	//Import and get Transformer
	Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

	List<Row> pairs = sparkModel.transform(dataset).select("rawText", "token").collectAsList();

	for (Row row : pairs) {
		Map<String, Object> data = new HashMap<String, Object>();
		data.put(sparkModel.getInputCol(), row.getString(0));
		transformer.transform(data);

		String[] actual = (String[]) data.get(sparkModel.getOutputCol());

		List<String> actualList = Arrays.asList(actual);
		List<String> expected = row.getList(1);

		assertTrue("both should be same", actualList.equals(expected));
	}
}
 
Example 19
Source File: DefinitionToSparkVisitor.java    From bunsen with Apache License 2.0
@Override
public DataType getDataType() {
  return DataTypes.IntegerType;
}
 
Example 20
Source File: KuduOutput.java    From envelope with Apache License 2.0
private StructType schemaFor(KuduTable table) {
  List<StructField> fields = Lists.newArrayList();

  for (ColumnSchema columnSchema : table.getSchema().getColumns()) {
    DataType fieldType;

    switch (columnSchema.getType()) {
      case DOUBLE:
        fieldType = DataTypes.DoubleType;
        break;
      case FLOAT:
        fieldType = DataTypes.FloatType;
        break;
      case INT8:
        fieldType = DataTypes.ByteType;
        break;
      case INT16:
        fieldType = DataTypes.ShortType;
        break;
      case INT32:
        fieldType = DataTypes.IntegerType;
        break;
      case INT64:
        fieldType = DataTypes.LongType;
        break;
      case STRING:
        fieldType = DataTypes.StringType;
        break;
      case BOOL:
        fieldType = DataTypes.BooleanType;
        break;
      case BINARY:
        fieldType = DataTypes.BinaryType;
        break;
      case UNIXTIME_MICROS:
        fieldType = DataTypes.TimestampType;
        break;
      case DECIMAL:
        int precision = columnSchema.getTypeAttributes().getPrecision();
        int scale = columnSchema.getTypeAttributes().getScale();
        fieldType = DataTypes.createDecimalType(precision, scale);
        break;
      default:
        throw new RuntimeException("Unsupported Kudu column type: " + columnSchema.getType());
    }

    fields.add(DataTypes.createStructField(columnSchema.getName(), fieldType, true));
  }

  return DataTypes.createStructType(fields);
}
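Note that schemaFor registers every column as nullable (the final true argument), regardless of the Kudu column's own nullability. A hedged sketch of carrying the Kudu nullability through instead:

// Alternative (assumption): mirror Kudu's per-column nullability.
fields.add(DataTypes.createStructField(
    columnSchema.getName(), fieldType, columnSchema.isNullable()));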