org.datavec.arrow.ArrowConverter Java Examples

The following examples show how to use org.datavec.arrow.ArrowConverter. All of them come from the open-source DataVec project (since merged into deeplearning4j); the source file and license are noted above each snippet.
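Before the examples, here is a minimal round trip from DataVec records to Arrow columns and back. This is a sketch rather than code from the projects above: the column names ("a", "b") and the allocator setup are illustrative, and it assumes ArrowConverter.toArrowWritables returns an ArrowWritableRecordBatch, as the examples below suggest.

import java.util.Arrays;
import java.util.List;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.FieldVector;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.IntWritable;
import org.datavec.api.writable.Writable;
import org.datavec.arrow.ArrowConverter;
import org.datavec.arrow.recordreader.ArrowWritableRecordBatch;

public class ArrowConverterRoundTrip {
    public static void main(String[] args) {
        // Arrow memory lives off-heap and is managed through a BufferAllocator.
        BufferAllocator bufferAllocator = new RootAllocator(Long.MAX_VALUE);

        Schema schema = new Schema.Builder()
                .addColumnInteger("a")
                .addColumnInteger("b")
                .build();

        List<List<Writable>> records = Arrays.asList(
                Arrays.<Writable>asList(new IntWritable(1), new IntWritable(2)),
                Arrays.<Writable>asList(new IntWritable(3), new IntWritable(4)));

        // Row-major DataVec records -> one Arrow FieldVector per column.
        List<FieldVector> columns = ArrowConverter.toArrowColumns(bufferAllocator, schema, records);

        // View the columns as a list of records again, backed by the Arrow buffers.
        ArrowWritableRecordBatch batch = ArrowConverter.toArrowWritables(columns, schema);
        System.out.println(batch.get(0)); // first row: [1, 2]
    }
}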
Example #1
Source File: ArrowWritableRecordBatch.java    From DataVec (also in deeplearning4j) with Apache License 2.0
@Override
public List<Writable> get(int i) {
    List<Writable> ret = new ArrayList<>(schema.numColumns());
    for (int column = 0; column < schema.numColumns(); column++) {
        try {
            if (!list.get(column).isNull(offset + i)) {
                // Convert the Arrow entry at this row/column to the schema's Writable type.
                ret.add(ArrowConverter.fromEntry(offset + i, list.get(column), schema.getType(column)));
            } else {
                ret.add(NullWritable.INSTANCE);
            }
        } catch (Exception e) {
            // Any conversion failure is treated as a missing value.
            ret.add(NullWritable.INSTANCE);
        }
    }
    return ret;
}
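Note the error handling: a null Arrow entry, or any exception thrown while converting one, becomes NullWritable.INSTANCE in the returned row, so reading a row never fails outright.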
 
Example #2
Source File: ArrowWritableRecordBatch.java    From DataVec (also in deeplearning4j) with Apache License 2.0
@Override
public List<Writable> set(int i, List<Writable> writable) {
    int rowOffset = offset + i;
    List<Writable> old = get(i);
    if (writable.size() != schema.numColumns()) {
        throw new IllegalArgumentException("Unable to set value. Expected " + schema.numColumns()
                + " columns but got " + writable.size());
    }

    // Write each value through to the backing Arrow FieldVector.
    int colIdx = 0;
    for (FieldVector fieldVector : list) {
        ArrowConverter.setValue(schema.getType(colIdx), fieldVector, writable.get(colIdx), rowOffset);
        colIdx++;
    }

    return old;
}
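Because the batch is a live view over Arrow FieldVectors, set(...) writes through to the underlying Arrow memory and returns the previous row, following the java.util.List contract. Continuing the round-trip sketch above:

    List<Writable> previous = batch.set(0, Arrays.<Writable>asList(new IntWritable(9), new IntWritable(8)));
    System.out.println(previous);     // [1, 2]
    System.out.println(batch.get(0)); // [9, 8]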
 
Example #3
Source File: ArrowRecordWriter.java    From DataVec (also in deeplearning4j) with Apache License 2.0
@Override
public PartitionMetaData writeBatch(List<List<Writable>> batch) throws IOException {
    // Roll over to a new partition (file) if the partitioner says the current one is full.
    if (partitioner.needsNewPartition()) {
        partitioner.currentOutputStream().flush();
        partitioner.currentOutputStream().close();
        partitioner.openNewStream();
    }

    // Arrow-backed batches are written as-is; anything else is converted on the fly.
    if (batch instanceof ArrowWritableRecordBatch) {
        ArrowWritableRecordBatch arrowWritableRecordBatch = (ArrowWritableRecordBatch) batch;
        ArrowConverter.writeRecordBatchTo(arrowWritableRecordBatch, schema, partitioner.currentOutputStream());
    } else {
        ArrowConverter.writeRecordBatchTo(batch, schema, partitioner.currentOutputStream());
    }

    partitioner.currentOutputStream().flush();
    return PartitionMetaData.builder().numRecordsUpdated(batch.size()).build();
}
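The written stream uses Arrow's file format, so it can be read back with ArrowConverter's read methods. A sketch reusing records and schema from the round-trip example above; it assumes the readFromBytes(byte[]) overload, which returns a Pair of the recovered schema and an Arrow-backed batch (the Pair type is org.nd4j.linalg.primitives.Pair in DataVec-era builds, a detail worth verifying against your version):

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    ArrowConverter.writeRecordBatchTo(records, schema, out);

    // Read the bytes back into a schema plus an Arrow-backed record batch.
    Pair<Schema, ArrowWritableRecordBatch> read = ArrowConverter.readFromBytes(out.toByteArray());
    System.out.println(read.getFirst().equals(schema)); // true
    System.out.println(read.getSecond().size());        // 2 rows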
 
Example #4
Source File: ArrowWritableRecordTimeSeriesBatchTests.java    From DataVec (also in deeplearning4j) with Apache License 2.0
@Test
public void testBasicIndexing() {
    Schema.Builder schema = new Schema.Builder();
    for (int i = 0; i < 3; i++) {
        schema.addColumnInteger(String.valueOf(i));
    }

    // One time step: 3 rows x 3 integer columns.
    List<List<Writable>> timeStep = Arrays.asList(
            Arrays.<Writable>asList(new IntWritable(0), new IntWritable(1), new IntWritable(2)),
            Arrays.<Writable>asList(new IntWritable(1), new IntWritable(2), new IntWritable(3)),
            Arrays.<Writable>asList(new IntWritable(4), new IntWritable(5), new IntWritable(6))
    );

    // Repeat the same time step five times.
    int numTimeSteps = 5;
    List<List<List<Writable>>> timeSteps = new ArrayList<>(numTimeSteps);
    for (int i = 0; i < numTimeSteps; i++) {
        timeSteps.add(timeStep);
    }

    List<FieldVector> fieldVectors = ArrowConverter.toArrowColumnsTimeSeries(bufferAllocator, schema.build(), timeSteps);
    assertEquals(3, fieldVectors.size());
    for (FieldVector fieldVector : fieldVectors) {
        for (int i = 0; i < fieldVector.getValueCount(); i++) {
            assertFalse("Index " + i + " was null for field vector " + fieldVector, fieldVector.isNull(i));
        }
    }

    // Wrap the flat columns back up as a time-series batch and verify the round trip.
    ArrowWritableRecordTimeSeriesBatch arrowWritableRecordTimeSeriesBatch =
            new ArrowWritableRecordTimeSeriesBatch(fieldVectors, schema.build(), timeStep.size() * timeStep.get(0).size());
    assertEquals(timeSteps, arrowWritableRecordTimeSeriesBatch.toArrayList());
}
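toArrowColumnsTimeSeries flattens each column across every time step, so each of the three vectors above holds 5 steps x 3 rows = 15 values. The final constructor argument (here timeStep rows x columns = 9) presumably tells the batch how many values make up one time step when slicing the flat vectors back into the 3-D structure that toArrayList() returns.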
 
Example #5
Source File: LocalTransformExecutor.java    From DataVec (also in deeplearning4j) with Apache License 2.0
/**
 * Execute a join on the specified data
 *
 * @param join  Join to execute
 * @param left  Left data for join
 * @param right Right data for join
 * @return Joined data
 */
public static List<List<Writable>> executeJoin(Join join, List<List<Writable>> left,
                                               List<List<Writable>> right) {

    // Resolve the left join-key column names to column indexes.
    String[] leftColumnNames = join.getJoinColumnsLeft();
    int[] leftColumnIndexes = new int[leftColumnNames.length];
    for (int i = 0; i < leftColumnNames.length; i++) {
        leftColumnIndexes[i] = join.getLeftSchema().getIndexOfColumn(leftColumnNames[i]);
    }

    // Pair each left row with its extracted join key
    // (rows consisting only of the key columns are skipped).
    ExtractKeysFunction leftKeysFunction = new ExtractKeysFunction(leftColumnIndexes);
    List<Pair<List<Writable>, List<Writable>>> leftJV = left.stream()
            .filter(input -> input.size() != leftColumnNames.length)
            .map(leftKeysFunction::apply)
            .collect(toList());

    // Same for the right side.
    String[] rightColumnNames = join.getJoinColumnsRight();
    int[] rightColumnIndexes = new int[rightColumnNames.length];
    for (int i = 0; i < rightColumnNames.length; i++) {
        rightColumnIndexes[i] = join.getRightSchema().getIndexOfColumn(rightColumnNames[i]);
    }

    ExtractKeysFunction rightKeysFunction = new ExtractKeysFunction(rightColumnIndexes);
    List<Pair<List<Writable>, List<Writable>>> rightJV = right.stream()
            .filter(input -> input.size() != rightColumnNames.length)
            .map(rightKeysFunction::apply)
            .collect(toList());

    // Group both sides by key, then emit the joined rows for each key.
    Map<List<Writable>, Pair<List<List<Writable>>, List<List<Writable>>>> cogroupedJV =
            FunctionalUtils.cogroup(leftJV, rightJV);
    ExecuteJoinFromCoGroupFlatMapFunction joinFunction = new ExecuteJoinFromCoGroupFlatMapFunction(join);
    List<List<Writable>> ret = cogroupedJV.entrySet().stream()
            .flatMap(input -> joinFunction.call(Pair.of(input.getKey(), input.getValue())).stream())
            .collect(toList());

    // Materialize the result as Arrow-backed writables with the join's output schema.
    Schema retSchema = join.getOutputSchema();
    return ArrowConverter.toArrowWritables(ArrowConverter.toArrowColumns(bufferAllocator, retSchema, ret), retSchema);
}
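A hypothetical call site, assuming DataVec's Join.Builder API; the schemas and rows here are made up for illustration:

    Schema leftSchema = new Schema.Builder()
            .addColumnInteger("key").addColumnDouble("valueA").build();
    Schema rightSchema = new Schema.Builder()
            .addColumnInteger("key").addColumnDouble("valueB").build();

    Join join = new Join.Builder(Join.JoinType.Inner)
            .setJoinColumns("key")        // join on the shared "key" column
            .setSchemas(leftSchema, rightSchema)
            .build();

    // leftRows / rightRows are List<List<Writable>> shaped to the schemas above.
    List<List<Writable>> joined = LocalTransformExecutor.executeJoin(join, leftRows, rightRows);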
 
Example #6
Source File: CSVSparkTransform.java    From DataVec (also in deeplearning4j) with Apache License 2.0
/**
 * Convert a raw record via the {@link TransformProcess}
 * to a Base64-encoded ndarray
 * @param batch the record to convert
 * @return the Base64-encoded ndarray
 * @throws IOException
 */
public Base64NDArrayBody toArray(BatchCSVRecord batch) throws IOException {
    // Parse the CSV strings into Arrow columns, wrap them as writables,
    // and run the transform process over them.
    List<List<Writable>> converted = execute(toArrowWritables(toArrowColumnsString(
            bufferAllocator, transformProcess.getInitialSchema(),
            batch.getRecordsAsString()),
            transformProcess.getInitialSchema()), transformProcess);

    // The executor returns an Arrow-backed batch, which converts directly to an INDArray.
    ArrowWritableRecordBatch arrowRecordBatch = (ArrowWritableRecordBatch) converted;
    INDArray convert = ArrowConverter.toArray(arrowRecordBatch);
    return new Base64NDArrayBody(Nd4jBase64.base64String(convert));
}
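The cast to ArrowWritableRecordBatch works because execute(...) over Arrow-backed writables evidently stays in Arrow form, as this code assumes; ArrowConverter.toArray then turns the columns into a 2-D INDArray (rows x columns). The same conversion works on any Arrow-backed batch, e.g. with the batch from the first sketch:

    INDArray arr = ArrowConverter.toArray(batch); // 2 x 2 matrix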
 