org.apache.spark.sql.Row Java Examples

The following examples show how to use org.apache.spark.sql.Row. Each example is drawn from an open-source project; the source file and license are noted above each snippet.
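As a quick orientation before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) that builds a Dataset<Row> from explicit Row objects and reads fields back by position and by name; the class and column names are illustrative.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowBasicsExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("RowBasicsExample")
        .master("local[*]")
        .getOrCreate();

    StructType schema = new StructType(new StructField[]{
        new StructField("name", DataTypes.StringType, false, Metadata.empty()),
        new StructField("age", DataTypes.IntegerType, false, Metadata.empty())
    });

    List<Row> data = Arrays.asList(
        RowFactory.create("Alice", 34),
        RowFactory.create("Bob", 45));

    Dataset<Row> df = spark.createDataFrame(data, schema);

    for (Row row : df.collectAsList()) {
      // Positional access requires knowing the column order...
      String name = row.getString(0);
      // ...while getAs(fieldName) uses the schema carried by the row.
      int age = row.<Integer>getAs("age");
      System.out.println(name + " is " + age);
    }

    spark.stop();
  }
}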
Example #1
Source File: SparkDataSourceManager.java    From DDF with Apache License 2.0
@Override
public DDF loadFromJDBC(JDBCDataSourceDescriptor dataSource) throws DDFException {
    SparkDDFManager sparkDDFManager = (SparkDDFManager)mDDFManager;
    HiveContext sqlContext = sparkDDFManager.getHiveContext();

    JDBCDataSourceCredentials cred = (JDBCDataSourceCredentials)dataSource.getDataSourceCredentials();
    String fullURL = dataSource.getDataSourceUri().getUri().toString();
    if (cred.getUsername() != null &&  !cred.getUsername().equals("")) {
        fullURL += String.format("?user=%s&password=%s", cred.getUsername(), cred.getPassword());
    }

    Map<String, String> options = new HashMap<String, String>();
    options.put("url", fullURL);
    options.put("dbtable", dataSource.getDbTable());
    DataFrame df = sqlContext.load("jdbc", options);

    DDF ddf = sparkDDFManager.newDDF(sparkDDFManager, df, new Class<?>[]{DataFrame.class},
        null, SparkUtils.schemaFromDataFrame(df));
    // TODO?
    ddf.getRepresentationHandler().get(RDD.class, Row.class);
    ddf.getMetaDataHandler().setDataSourceDescriptor(dataSource);
    return ddf;
}
 
Example #2
Source File: StructureToInteractingResidues.java    From mmtf-spark with Apache License 2.0
@Override
public Iterator<Row> call(Tuple2<String, StructureDataInterface> t) throws Exception {
	String structureId = t._1;
	StructureDataInterface structure = t._2;
	
	List<Integer> groupIndices = new ArrayList<>();
	List<String> groupNames = new ArrayList<>();		
	getGroupIndices(structure, groupIndices, groupNames);		

	List<Row> neighbors = new ArrayList<>();
	for (int i = 0; i < groupNames.size(); i++) {
		if (groupNames.get(i).equals(groupName)) {
			List<Integer> matches = new ArrayList<>();
			float[] boundingBox = calcBoundingBox(structure, groupIndices, i, cutoffDistance);
			matches.addAll(findNeighbors(structure, i, boundingBox, groupIndices));
			neighbors.addAll(getDistanceProfile(structureId, matches, i, groupIndices, groupNames, structure));
		}
	}
	
	return neighbors.iterator();
}
 
Example #3
Source File: GroupConcatDistinctUDAF.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Update
 * Can be thought of as passing in the field values within a group one by one,
 * implementing the concatenation logic.
 */
@Override
public void update(MutableAggregationBuffer buffer, Row input) {
	// The city info string already concatenated in the buffer
	String bufferCityInfo = buffer.getString(0);
	// The city info that was just passed in
	String cityInfo = input.getString(0);
	
	// Implement the deduplication logic here:
	// only append the new city info if it has not been concatenated before
	if(!bufferCityInfo.contains(cityInfo)) {
		if("".equals(bufferCityInfo)) {
			bufferCityInfo += cityInfo;
		} else {
			// e.g. "1:Beijing" becomes
			// "1:Beijing,2:Shanghai"
			bufferCityInfo += "," + cityInfo;
		}
		
		buffer.update(0, bufferCityInfo);  
	}
}
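For context, a UDAF like this is normally registered with the session and then invoked from SQL. The following is a hedged sketch, assuming GroupConcatDistinctUDAF extends UserDefinedAggregateFunction and is on the classpath; the registered function name, table, and column names are illustrative and not taken from the project.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class GroupConcatDistinctUsage {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("GroupConcatDistinctUsage")
        .master("local[*]")
        .getOrCreate();

    // Register the UDAF under a SQL-callable name (assumes the class extends
    // org.apache.spark.sql.expressions.UserDefinedAggregateFunction).
    spark.udf().register("group_concat_distinct", new GroupConcatDistinctUDAF());

    // Illustrative query against a view with area and city_info columns.
    Dataset<Row> result = spark.sql(
        "SELECT area, group_concat_distinct(city_info) AS city_infos "
            + "FROM area_city_info GROUP BY area");
    result.show();

    spark.stop();
  }
}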
 
Example #4
Source File: TestPassthroughDeriver.java    From envelope with Apache License 2.0
@Test
public void testPassthrough() throws Exception {
  StructType schema = DataTypes.createStructType(Lists.<StructField>newArrayList(
      DataTypes.createStructField("col1", DataTypes.StringType, false)));
  Dataset<Row> dep1 = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("a")), schema);
  Dataset<Row> dep2 = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("b")), schema);
  Map<String, Dataset<Row>> dependencies = Maps.newHashMap();
  dependencies.put("dep1", dep1);
  dependencies.put("dep2", dep2);

  Deriver deriver = new PassthroughDeriver();

  List<Row> result = deriver.derive(dependencies).collectAsList();

  assertTrue(result.contains(RowFactory.create("a")));
  assertTrue(result.contains(RowFactory.create("b")));
  assertEquals(2, result.size());
}
 
Example #5
Source File: SparkUtil.java    From kylin with Apache License 2.0
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder().sparkContext(HiveUtils.withHiveExternalCatalog(sc.sc()))
            .config(sc.getConf()).enableHiveSupport().getOrCreate();
    final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                if (o != null) {
                    result[i] = o.toString();
                } else {
                    result[i] = null;
                }
            }
            return result;
        }
    });
}
 
Example #6
Source File: AvroTranslator.java    From envelope with Apache License 2.0
private Row rowForRecord(GenericRecord record) {
  List<Object> values = Lists.newArrayList();

  for (Field field : record.getSchema().getFields()) {
    Object value = record.get(field.name());

    Type fieldType = field.schema().getType();
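    // Nullable Avro unions are assumed to be of the form [null, type]; take the non-null branch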
    if (fieldType.equals(Type.UNION)) {
      fieldType = field.schema().getTypes().get(1).getType();
    }
    // Avro returns Utf8s for strings, which Spark SQL doesn't know how to use
    if (fieldType.equals(Type.STRING) && value != null) {
      value = value.toString();
    }
    // Avro returns binary as a ByteBuffer, but Spark SQL wants a byte[]
    if (fieldType.equals(Type.BYTES) && value != null) {
      value = ((ByteBuffer)value).array();
    }

    values.add(value);
  }

  return new RowWithSchema(schema, values.toArray());
}
 
Example #7
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testDontIgnoreNulls() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0,105));
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", null, new BigDecimal("0.00"));
  assertFalse("Row should not pass rule", rule.check(row1));
}
 
Example #8
Source File: DeepSparkContextTest.java    From deep-spark with Apache License 2.0
@Test
public void createJavaSchemaRDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = PowerMockito.mock(SQLContext.class);
    ExtractorConfig config = createDeepJobConfig();
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    PowerMockito.doReturn(singleRdd).when(deepSparkContextSpy).createJavaRDD(config);
    JavaRDD<Row> rowRDD = mock(JavaRDD.class);
    mockStatic(DeepSparkContext.class);
    when(DeepSparkContext.createJavaRowRDD(singleRdd)).thenReturn(rowRDD);
    Cells cells = mock(Cells.class);
    when(singleRdd.first()).thenReturn(cells);
    StructType schema = mock(StructType.class);
    mockStatic(CellsUtils.class);
    when(CellsUtils.getStructTypeFromCells(cells)).thenReturn(schema);

    deepSparkContextSpy.createJavaSchemaRDD(config);

    verify(sqlContext).applySchema(rowRDD, schema);
}
 
Example #9
Source File: SubStringCounterRelation.java    From net.jgp.labs.spark with Apache License 2.0
@Override
public RDD<Row> buildScan() {
  log.debug("-> buildScan()");

  // I have isolated the work to a method to keep the plumbing code as
  // simple as possible.
  List<List<Integer>> table = collectData();

  @SuppressWarnings("resource") // cannot be closed here, done elsewhere
  JavaSparkContext sparkContext = new JavaSparkContext(sqlContext
      .sparkContext());
  JavaRDD<Row> rowRDD = sparkContext.parallelize(table)
      .map(row -> RowFactory.create(row.toArray()));

  return rowRDD.rdd();
}
 
Example #10
Source File: BundlesTest.java    From bunsen with Apache License 2.0
@Test
public void testXmlBundleStrings() {

  JavaRDD<String> xmlBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/xml/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> xmlBundles = spark.createDataset(xmlBundlesRdd.rdd(),
      Encoders.STRING());

  xmlBundles.write().saveAsTable("xml_bundle_table");

  JavaRDD<BundleContainer> bundles = BundlesTest.bundles.fromXml(
      spark.sql("select value from xml_bundle_table"), "value");

  Dataset<Row> patients = BundlesTest.bundles.extractEntry(spark,
      bundles,
      Patient.class);

  checkPatients(patients);
}
 
Example #11
Source File: PerformanceCollector.java    From ExecDashboard with Apache License 2.0
private void updateCollectorItemMetricDetail(CollectorItemMetricDetail collectorItemMetricDetail, Row row) {
    Date timeWindowDt = row.getAs(STR_TIMEWINDOW);
    List<String> performanceMetricList = Arrays.asList(STR_AVG_RESPONSE_TIME,STR_CALLSPER_MINUTE,STR_ERROR_RATE);
    GenericRowWithSchema perfMetrics = row.getAs("metrics");

    for(String perfMetric : performanceMetricList){
        double value;
        try {
            Long metricValue = perfMetrics.getAs(perfMetric);
            value = metricValue.doubleValue();
        }catch (IllegalArgumentException exception){
            value = 0.0;
        }

        MetricCount mc = getMetricCount("", value, perfMetric);
        if (!mc.getLabel().isEmpty()) {
            collectorItemMetricDetail.setStrategy(getCollectionStrategy());
            collectorItemMetricDetail.addCollectorItemMetricCount(timeWindowDt, mc);
            collectorItemMetricDetail.setLastScanDate(timeWindowDt);
        }
    }
}
 
Example #12
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession,
                                                  JavaPairRDD<MatrixIndexes, MatrixBlock> in, DataCharacteristics mc, boolean toVector)
{
	if( !mc.colsKnown() )
		throw new RuntimeException("Number of columns needed to convert binary block to data frame.");
	
	//slice blocks into rows, align and convert into data frame rows
	JavaRDD<Row> rowsRDD = in
		.flatMapToPair(new SliceBinaryBlockToRowsFunction(mc.getBlocksize()))
		.groupByKey().map(new ConvertRowBlocksToRows((int)mc.getCols(), mc.getBlocksize(), toVector));
	
	//create data frame schema
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(DF_ID_COLUMN, DataTypes.DoubleType, false));
	if( toVector )
		fields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
	else { // row
		for(int i = 1; i <= mc.getCols(); i++)
			fields.add(DataTypes.createStructField("C"+i, DataTypes.DoubleType, false));
	}
	
	//rdd to data frame conversion
	return sparkSession.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}
 
Example #13
Source File: NManualBuildAndQueryCuboidTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private Integer convertOutSchema(Dataset<Row> layoutDs, String fieldName,
                                 org.apache.spark.sql.types.DataType dataType) {
    StructField[] structFieldList = layoutDs.schema().fields();
    String[] columns = layoutDs.columns();

    int index = 0;
    StructField[] outStructFieldList = new StructField[structFieldList.length];
    for (int i = 0; i < structFieldList.length; i++) {
        if (columns[i].equalsIgnoreCase(fieldName)) {
            index = i;
            StructField structField = structFieldList[i];
            outStructFieldList[i] = new StructField(structField.name(), dataType, false, structField.metadata());
        } else {
            outStructFieldList[i] = structFieldList[i];
        }
    }

    OUT_SCHEMA = new StructType(outStructFieldList);

    return index;
}
 
Example #14
Source File: VisualizerTest.java    From MegaSparkDiff with Apache License 2.0
private Pair<Dataset<Row>,Dataset<Row>> getAppleTablePair(String testName1, String testName2){
    AppleTable leftAppleTable = SparkFactory.parallelizeJDBCSource("org.hsqldb.jdbc.JDBCDriver",
            "jdbc:hsqldb:hsql://127.0.0.1:9001/testDb",
            "SA",
            "",
            "(select * from " + testName1 + ")", "table1");

    AppleTable rightAppleTable = SparkFactory.parallelizeJDBCSource("org.hsqldb.jdbc.JDBCDriver",
            "jdbc:hsqldb:hsql://127.0.0.1:9001/testDb",
            "SA",
            "",
            "(select * from " + testName2 + ")", "table2");

    return SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable);
}
 
Example #15
Source File: DataFrameReader.java    From nemo with Apache License 2.0
@Override
public Dataset<Row> json(final RDD<String> jsonRDD) {
  final boolean userTriggered = initializeFunction(jsonRDD);
  final Dataset<Row> result = Dataset.from(super.json(jsonRDD));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example #16
Source File: AvroTranslator.java    From envelope with Apache License 2.0
@Override
public Iterable<Row> translate(Row message) throws Exception {
  byte[] value = message.getAs(Translator.VALUE_FIELD_NAME);

  Decoder decoder = DecoderFactory.get().binaryDecoder(value, null);
  GenericRecord record = reader.read(null, decoder);
  Row row = rowForRecord(record);

  return Collections.singleton(row);
}
 
Example #17
Source File: DataFrameToSequenceCreateCombiner.java    From deeplearning4j with Apache License 2.0
@Override
public List<List<Writable>> call(Iterable<Row> rows) throws Exception {
    List<List<Writable>> retSeq = new ArrayList<>();
    for (Row v1 : rows) {
        List<Writable> ret = DataFrames.rowToWritables(schema, v1);
        retSeq.add(ret);
    }
    return retSeq;
}
 
Example #18
Source File: ParallelRowReadWriteDataSource.java    From spark-data-sources with MIT License
@Override
public DataReader<Row> createDataReader() {
    log.info("Factory creating reader for [" + _host + ":" + _port + "]" );
    try {
        return new TaskDataReader(_host, _port, _table, _schema, _split);
    } catch (UnknownTableException ute) {
        throw new RuntimeException(ute);
    }
}
 
Example #19
Source File: RDDConverterUtilsExt.java    From systemds with Apache License 2.0
/**
 * Add element indices as new column to DataFrame
 *
 * @param df input data frame
 * @param sparkSession the Spark Session
 * @param nameOfCol name of index column
 * @return new data frame
 */
public static Dataset<Row> addIDToDataFrame(Dataset<Row> df, SparkSession sparkSession, String nameOfCol) {
	StructField[] oldSchema = df.schema().fields();
	StructField[] newSchema = new StructField[oldSchema.length + 1];
	for(int i = 0; i < oldSchema.length; i++) {
		newSchema[i] = oldSchema[i];
	}
	newSchema[oldSchema.length] = DataTypes.createStructField(nameOfCol, DataTypes.DoubleType, false);
	// JavaRDD<Row> newRows = df.rdd().toJavaRDD().map(new AddRowID());
	JavaRDD<Row> newRows = df.rdd().toJavaRDD().zipWithIndex().map(new AddRowID());
	return sparkSession.createDataFrame(newRows, new StructType(newSchema));
}
 
Example #20
Source File: SparkCubingJobTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
    Map<Integer, FunctionDesc> orderedMeasures = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> entry : orderedMeasures.entrySet()) {
        FunctionDesc functionDesc = entry.getValue();
        if (functionDesc != null) {
            final String[] columns = layoutDs.columns();
            String functionName = functionDesc.returnType().dataType();

            if ("bitmap".equals(functionName)) {
                final int finalIndex = convertOutSchema(layoutDs, entry.getKey().toString(), DataTypes.LongType);
                PreciseCountDistinct preciseCountDistinct = new PreciseCountDistinct(null);
                layoutDs = layoutDs.map((MapFunction<Row, Row>) value -> {
                    Object[] ret = new Object[value.size()];
                    for (int i = 0; i < columns.length; i++) {
                        if (i == finalIndex) {
                            byte[] bytes = (byte[]) value.get(i);
                            Roaring64NavigableMap bitmapCounter = preciseCountDistinct.deserialize(bytes);
                            ret[i] = bitmapCounter.getLongCardinality();
                        } else {
                            ret[i] = value.get(i);
                        }
                    }
                    return RowFactory.create(ret);
                }, RowEncoder.apply(OUT_SCHEMA));
            }
        }
    }
    return layoutDs;
}
 
Example #21
Source File: JavaDCTExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaDCTExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
    RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
    RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("features", new VectorUDT(), false, Metadata.empty()),
  });
  Dataset<Row> df = spark.createDataFrame(data, schema);

  DCT dct = new DCT()
    .setInputCol("features")
    .setOutputCol("featuresDCT")
    .setInverse(false);

  Dataset<Row> dctDf = dct.transform(df);

  dctDf.select("featuresDCT").show(false);
  // $example off$

  spark.stop();
}
 
Example #22
Source File: TestEventTimeHistoryPlanner.java    From envelope with Apache License 2.0
@Test
public void testOneArrivingOneExistingWhereArrivingLaterThanExistingButSameValues() {
  p = new EventTimeHistoryPlanner();
  assertNoValidationFailures(p, config);
  p.configure(config);

  existing.add(new RowWithSchema(existingSchema, "a", "hello", 100L, 100L, 253402214400000L, EventTimeHistoryPlanner.CURRENT_FLAG_DEFAULT_YES, ""));
  arriving.add(new RowWithSchema(arrivingSchema, "a", "hello", 200L));
  key = new RowWithSchema(keySchema, "a");

  List<Row> planned = p.planMutationsForKey(key, arriving, existing);

  assertEquals(planned.size(), 0);
}
 
Example #23
Source File: RandomForestRegressionModelInfoAdapterBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testRandomForestRegression() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/regression_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a RandomForest model.
    RandomForestRegressionModel regressionModel = new RandomForestRegressor()
            .setFeaturesCol("features").fit(trainingData);

    byte[] exportedModel = ModelExporter.export(regressionModel, null);

    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = regressionModel.transform(testData).select("features", "prediction").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(0);
        double actual = row.getDouble(1);

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), v.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next());

        System.out.println(actual + ", " + predicted);
        assertEquals(actual, predicted, EPSILON);
    }
}
 
Example #24
Source File: TextFileToDataset.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Dataset from Text File")
      .master("local[*]")
      .getOrCreate();

  String filename = "data/simple-data-file.txt";
  Dataset<Row> df = spark.read().text(filename);
  df.show();
}
 
Example #25
Source File: SelectDeriver.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  dependencyCheck(dependencies);
  Dataset<Row> sourceStep = dependencies.get(stepName);
  if (useIncludeFields){
      if (!Arrays.asList(sourceStep.columns()).containsAll(includeFields)){
          throw new RuntimeException("Columns specified in " + INCLUDE_FIELDS + " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
      }
      String firstCol = includeFields.get(0);
      includeFields.remove(0);
      return sourceStep.select(firstCol, includeFields.toArray(new String[0]));
  } else {
      if (!Arrays.asList(sourceStep.columns()).containsAll(excludeFields)){
          throw new RuntimeException("Columns specified in " + EXCLUDE_FIELDS + " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
      }
      return sourceStep.drop(JavaConverters.collectionAsScalaIterableConverter(excludeFields).asScala().toSeq());
  }
}
 
Example #26
Source File: TranslateFunction.java    From envelope with Apache License 2.0
@Override
public Iterator<Row> call(Row message) throws Exception {
  validateMessageSchema(message);
  Iterable<Row> translationResults;

  try {
    translationResults = getTranslator().translate(message);
  }
  catch (Exception e) {
    Row error = appendHadErrorFlag(message, true);
    return Collections.singleton(error).iterator();
  }

  List<Row> translated = Lists.newArrayList();
  for (Row translationResult : translationResults) {
    validateTranslatedSchema(translationResult);

    if (doesAppendRaw()) {
      translationResult = appendRawFields(translationResult, message);
    }
    translationResult = appendHadErrorFlag(translationResult, false);

    translated.add(translationResult);
  }

  return translated.iterator();
}
 
Example #27
Source File: NBuildSourceInfo.java    From kylin-on-parquet-v2 with Apache License 2.0
public Dataset<Row> getParentDS() {
    if (!StringUtils.isBlank(parentStoragePath)) {
        logger.info("parent storage path exists, read from it. path:{}", parentStoragePath);
        Preconditions.checkNotNull(ss, "SparkSession is null is NBuildSourceInfo.");
        return ss.read().parquet(parentStoragePath);
    } else {
        Preconditions.checkState(flattableDS != null, "Path and DS cannot be empty at the same time.");
        logger.info("parent storage path does not exist, use flattable dataset.");
        return flattableDS;
    }
}
 
Example #28
Source File: JavaVectorAssemblerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaVectorAssemblerExample")
    .getOrCreate();

  // $example on$
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("hour", IntegerType, false),
    createStructField("mobile", DoubleType, false),
    createStructField("userFeatures", new VectorUDT(), false),
    createStructField("clicked", DoubleType, false)
  });
  Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
  Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);

  VectorAssembler assembler = new VectorAssembler()
    .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
    .setOutputCol("features");

  Dataset<Row> output = assembler.transform(dataset);
  System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
      "'features'");
  output.select("features", "clicked").show(false);
  // $example off$

  spark.stop();
}
 
Example #29
Source File: SparkDataFile.java    From iceberg with Apache License 2.0
public SparkDataFile wrap(Row row) {
  this.wrapped = row;
  if (wrappedPartition.size() > 0) {
    this.wrappedPartition.wrap(row.getAs(partitionPosition));
  }
  return this;
}
 
Example #30
Source File: TestBitemporalHistoryPlanner.java    From envelope with Apache License 2.0
@Test
public void testOneArrivingMultipleExistingWhereArrivingEarlierThanAllExisting() {
  p = new BitemporalHistoryPlanner();
  assertNoValidationFailures(p, config);
  p.configure(config);

  existing.add(new RowWithSchema(existingSchema, "a", "hello", 100L, 100L, 253402214400000L, 1L, 2L, CURRENT_FLAG_DEFAULT_NO));
  existing.add(new RowWithSchema(existingSchema, "a", "hello", 100L, 100L, 199L, 3L, 253402214400000L, CURRENT_FLAG_DEFAULT_NO));
  existing.add(new RowWithSchema(existingSchema, "a", "hello!", 200L, 200L, 253402214400000L, 3L, 4L, CURRENT_FLAG_DEFAULT_NO));
  existing.add(new RowWithSchema(existingSchema, "a", "hello!", 200L, 200L, 299L, 5L, 253402214400000L, CURRENT_FLAG_DEFAULT_NO));
  existing.add(new RowWithSchema(existingSchema, "a", "hello?", 300L, 300L, 253402214400000L, 5L, 253402214400000L, CURRENT_FLAG_DEFAULT_YES));
  arriving.add(new RowWithSchema(arrivingSchema, "a", "world", 50L));
  Row key = new RowWithSchema(keySchema, "a");

  List<Row> planned = p.planMutationsForKey(key, arriving, existing);

  assertEquals(planned.size(), 1);

  Long systemStart0 = planned.get(0).getAs("systemstart");

  assertEquals(planned.get(0).getAs("value"), "world");
  assertEquals(planned.get(0).getAs("eventstart"), 50L);
  assertEquals(planned.get(0).getAs("eventend"), 99L);
  assertTrue(systemStart0 >= preplanSystemTime);
  assertTrue(systemStart0 < preplanSystemTime + 5000);
  assertEquals(planned.get(0).getAs("systemend"), 253402214400000L);
  assertEquals(planned.get(0).getAs("currentflag"), CURRENT_FLAG_DEFAULT_NO);
}