org.datavec.arrow.recordreader.ArrowWritableRecordBatch Java Examples

The following examples show how to use org.datavec.arrow.recordreader.ArrowWritableRecordBatch. Each example is drawn from an open-source project; the source file, project, and license are noted above the code.
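Before the examples, here is a minimal, hedged sketch of how an ArrowWritableRecordBatch is typically built in memory: DataVec writables are converted to Arrow column vectors, and the resulting batch behaves as a List<List<Writable>> backed by those columns. The column names and allocator limit are illustrative; import paths follow the DataVec/deeplearning4j APIs used in the examples below and may differ slightly between versions.

import java.util.Arrays;
import java.util.List;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.FieldVector;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.IntWritable;
import org.datavec.api.writable.Writable;
import org.datavec.arrow.ArrowConverter;
import org.datavec.arrow.recordreader.ArrowWritableRecordBatch;

public class ArrowWritableRecordBatchSketch {
    public static void main(String[] args) {
        BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);

        // Two integer columns, two rows.
        Schema schema = new Schema.Builder()
                .addColumnInteger("a")
                .addColumnInteger("b")
                .build();
        List<List<Writable>> rows = Arrays.asList(
                Arrays.<Writable>asList(new IntWritable(0), new IntWritable(1)),
                Arrays.<Writable>asList(new IntWritable(2), new IntWritable(3)));

        // Writables -> Arrow column vectors -> ArrowWritableRecordBatch.
        List<FieldVector> columns = ArrowConverter.toArrowColumns(allocator, schema, rows);
        ArrowWritableRecordBatch batch = new ArrowWritableRecordBatch(columns, schema);

        // The batch reads back as one List<Writable> per row.
        for (List<Writable> row : batch) {
            System.out.println(row);
        }
    }
}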
Example #1
Source File: ArrowConverter.java    From deeplearning4j with Apache License 2.0
/**
 * Read a datavec schema and record set
 * from the given arrow file.
 * @param input the input to read
 * @return the associated datavec schema and record
 */
public static Pair<Schema,ArrowWritableRecordBatch> readFromFile(FileInputStream input) throws IOException {
    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
    Schema retSchema = null;
    ArrowWritableRecordBatch ret = null;
    SeekableReadChannel channel = new SeekableReadChannel(input.getChannel());
    ArrowFileReader reader = new ArrowFileReader(channel, allocator);
    reader.loadNextBatch();
    retSchema = toDatavecSchema(reader.getVectorSchemaRoot().getSchema());
    //load the batch
    VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot());
    VectorLoader vectorLoader = new VectorLoader(reader.getVectorSchemaRoot());
    ArrowRecordBatch recordBatch = unloader.getRecordBatch();

    vectorLoader.load(recordBatch);
    ret = asDataVecBatch(recordBatch,retSchema,reader.getVectorSchemaRoot());
    ret.setUnloader(unloader);

    return Pair.of(retSchema,ret);

}
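As a hedged usage sketch (not part of the original source), readFromFile pairs with ArrowConverter.writeRecordBatchTo, which appears in the test examples further down. The file name is a placeholder, and imports mirror those examples (including org.nd4j Pair, whose package varies by version).

// Sketch: write rows to an Arrow file, then read the schema and batch back.
public static void roundTripThroughFile(Schema schema, List<List<Writable>> rows) throws IOException {
    File file = new File("batch.arrow"); // placeholder path
    try (FileOutputStream out = new FileOutputStream(file)) {
        ArrowConverter.writeRecordBatchTo(rows, schema, out);
    }

    Pair<Schema, ArrowWritableRecordBatch> read =
            ArrowConverter.readFromFile(new FileInputStream(file));
    System.out.println(read.getFirst().numColumns() + " columns, " + read.getRight().size() + " rows");
}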
 
Example #2
Source File: ArrowConverter.java    From DataVec with Apache License 2.0
/**
 * Read a datavec schema and record set
 * from the given arrow file.
 * @param input the input to read
 * @return the associated datavec schema and record
 */
public static Pair<Schema,ArrowWritableRecordBatch> readFromFile(FileInputStream input) throws IOException {
    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
    Schema retSchema = null;
    ArrowWritableRecordBatch ret = null;
    SeekableReadChannel channel = new SeekableReadChannel(input.getChannel());
    ArrowFileReader reader = new ArrowFileReader(channel, allocator);
    reader.loadNextBatch();
    retSchema = toDatavecSchema(reader.getVectorSchemaRoot().getSchema());
    //load the batch
    VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot());
    VectorLoader vectorLoader = new VectorLoader(reader.getVectorSchemaRoot());
    ArrowRecordBatch recordBatch = unloader.getRecordBatch();

    vectorLoader.load(recordBatch);
    ret = asDataVecBatch(recordBatch,retSchema,reader.getVectorSchemaRoot());
    ret.setUnloader(unloader);

    return Pair.of(retSchema,ret);

}
 
Example #3
Source File: ArrowBinaryInputAdapterTest.java    From konduit-serving with Apache License 2.0
@Test(timeout = 60000)
public void testArrowBinary() throws Exception {
    Schema irisInputSchema = TrainUtils.getIrisInputSchema();
    ArrowRecordWriter arrowRecordWriter = new ArrowRecordWriter(irisInputSchema);
    CSVRecordReader reader = new CSVRecordReader();
    reader.initialize(new FileSplit(new ClassPathResource("iris.txt").getFile()));
    List<List<Writable>> writables = reader.next(150);

    File tmpFile = new File(temporary.getRoot(), "tmp.arrow");
    FileSplit fileSplit = new FileSplit(tmpFile);
    arrowRecordWriter.initialize(fileSplit, new NumberOfRecordsPartitioner());
    arrowRecordWriter.writeBatch(writables);
    byte[] arrowBytes = FileUtils.readFileToByteArray(tmpFile);

    Buffer buffer = Buffer.buffer(arrowBytes);
    ArrowBinaryInputAdapter arrowBinaryInputAdapter = new ArrowBinaryInputAdapter();
    ArrowWritableRecordBatch convert = arrowBinaryInputAdapter.convert(buffer, ConverterArgs.builder().schema(irisInputSchema).build(), null);
    assertEquals(writables.size(), convert.size());
}
 
Example #4
Source File: ArrowConverter.java    From DataVec with Apache License 2.0
/**
 * Read a datavec schema and record set
 * from the given bytes (usually expected to be an arrow format file)
 * @param input the input to read
 * @return the associated datavec schema and record
 */
public static Pair<Schema,ArrowWritableRecordBatch> readFromBytes(byte[] input) throws IOException {
    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
    Schema retSchema = null;
    ArrowWritableRecordBatch ret = null;
    SeekableReadChannel channel = new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(input));
    ArrowFileReader reader = new ArrowFileReader(channel, allocator);
    reader.loadNextBatch();
    retSchema = toDatavecSchema(reader.getVectorSchemaRoot().getSchema());
    //load the batch
    VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot());
    VectorLoader vectorLoader = new VectorLoader(reader.getVectorSchemaRoot());
    ArrowRecordBatch recordBatch = unloader.getRecordBatch();

    vectorLoader.load(recordBatch);
    ret = asDataVecBatch(recordBatch,retSchema,reader.getVectorSchemaRoot());
    ret.setUnloader(unloader);

    return Pair.of(retSchema,ret);

}
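A hedged in-memory counterpart (not in the original source): serialize a batch to a byte array with writeRecordBatchTo and read it back with readFromBytes. Imports mirror the examples above; the schema and rows are supplied by the caller.

// Sketch: rows -> Arrow file bytes -> schema + ArrowWritableRecordBatch.
public static Pair<Schema, ArrowWritableRecordBatch> roundTripThroughBytes(
        Schema schema, List<List<Writable>> rows) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    ArrowConverter.writeRecordBatchTo(rows, schema, out);
    return ArrowConverter.readFromBytes(out.toByteArray());
}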
 
Example #5
Source File: ArrowUtils.java    From konduit-serving with Apache License 2.0
public static Pair<Schema, ArrowWritableRecordBatch> readFromBytes(byte[] input) throws IOException {
    BufferAllocator allocator = new RootAllocator(9223372036854775807L);
    Schema retSchema = null;
    ArrowWritableRecordBatch ret = null;
    SeekableReadChannel channel = new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(input));
    ArrowFileReader reader = new ArrowFileReader(channel, allocator);
    reader.loadNextBatch();
    retSchema = toDatavecSchema(reader.getVectorSchemaRoot().getSchema());
    VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot());
    VectorLoader vectorLoader = new VectorLoader(reader.getVectorSchemaRoot());
    ArrowRecordBatch recordBatch = unloader.getRecordBatch();
    vectorLoader.load(recordBatch);
    ret = asDataVecBatch(recordBatch, retSchema, reader.getVectorSchemaRoot());
    ret.setUnloader(unloader);
    return Pair.of(retSchema, ret);
}
 
Example #6
Source File: ArrowUtils.java    From konduit-serving with Apache License 2.0
public static Pair<Schema, ArrowWritableRecordBatch> readFromFile(FileInputStream input) throws IOException {
    BufferAllocator allocator = new RootAllocator(9223372036854775807L);
    Schema retSchema = null;
    ArrowWritableRecordBatch ret = null;
    SeekableReadChannel channel = new SeekableReadChannel(input.getChannel());
    ArrowFileReader reader = new ArrowFileReader(channel, allocator);
    reader.loadNextBatch();
    retSchema = toDatavecSchema(reader.getVectorSchemaRoot().getSchema());
    VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot());
    VectorLoader vectorLoader = new VectorLoader(reader.getVectorSchemaRoot());
    ArrowRecordBatch recordBatch = unloader.getRecordBatch();
    vectorLoader.load(recordBatch);
    ret = asDataVecBatch(recordBatch, retSchema, reader.getVectorSchemaRoot());
    ret.setUnloader(unloader);
    return Pair.of(retSchema, ret);
}
 
Example #7
Source File: ArrowConverterTest.java    From DataVec with Apache License 2.0
@Test
public void testArrowBatchSetTime() {
    Schema.Builder schema = new Schema.Builder();
    List<String> single = new ArrayList<>();
    for(int i = 0; i < 2; i++) {
        schema.addColumnTime(String.valueOf(i),TimeZone.getDefault());
        single.add(String.valueOf(i));
    }

    List<List<Writable>> input = Arrays.asList(
            Arrays.<Writable>asList(new LongWritable(0),new LongWritable(1)),
            Arrays.<Writable>asList(new LongWritable(2),new LongWritable(3))
    );

    List<FieldVector> fieldVector = ArrowConverter.toArrowColumns(bufferAllocator,schema.build(),input);
    ArrowWritableRecordBatch writableRecordBatch = new ArrowWritableRecordBatch(fieldVector,schema.build());
    List<Writable> assertion = Arrays.<Writable>asList(new LongWritable(4), new LongWritable(5));
    writableRecordBatch.set(1, Arrays.<Writable>asList(new LongWritable(4),new LongWritable(5)));
    List<Writable> recordTest = writableRecordBatch.get(1);
    assertEquals(assertion,recordTest);
}
 
Example #8
Source File: ArrowConverterTest.java    From DataVec with Apache License 2.0
@Test
public void testArrowBatchSet() {
    Schema.Builder schema = new Schema.Builder();
    List<String> single = new ArrayList<>();
    for(int i = 0; i < 2; i++) {
        schema.addColumnInteger(String.valueOf(i));
        single.add(String.valueOf(i));
    }

    List<List<Writable>> input = Arrays.asList(
            Arrays.<Writable>asList(new IntWritable(0),new IntWritable(1)),
            Arrays.<Writable>asList(new IntWritable(2),new IntWritable(3))
    );

    List<FieldVector> fieldVector = ArrowConverter.toArrowColumns(bufferAllocator,schema.build(),input);
    ArrowWritableRecordBatch writableRecordBatch = new ArrowWritableRecordBatch(fieldVector,schema.build());
    List<Writable> assertion = Arrays.<Writable>asList(new IntWritable(4), new IntWritable(5));
    writableRecordBatch.set(1, Arrays.<Writable>asList(new IntWritable(4),new IntWritable(5)));
    List<Writable> recordTest = writableRecordBatch.get(1);
    assertEquals(assertion,recordTest);
}
 
Example #9
Source File: BaseJsonArrayConverter.java    From konduit-serving with Apache License 2.0
protected Pair<Map<Integer, Integer>, List<? extends Map<FieldName, ?>>> doTransformProcessConvertPmmlWithErrors(Schema schema, JsonArray jsonArray, TransformProcess transformProcess, DataPipelineErrorHandler dataPipelineErrorHandler) {
    Schema outputSchema = transformProcess.getFinalSchema();

    if (!transformProcess.getInitialSchema().equals(schema)) {
        throw new IllegalArgumentException("Transform process specified, but does not match target input schema");
    }


    List<Map<FieldName, Object>> ret = new ArrayList<>(jsonArray.size());
    List<FieldName> fieldNames = getNameRepresentationFor(outputSchema);

    Pair<Map<Integer, Integer>, ArrowWritableRecordBatch> convertWithErrors = convertWithErrors(schema, jsonArray, transformProcess, dataPipelineErrorHandler);
    ArrowWritableRecordBatch conversion = convertWithErrors.getRight();
    for (int i = 0; i < conversion.size(); i++) {
        List<Writable> recordToMap = conversion.get(i);
        Map<FieldName, Object> record = new LinkedHashMap();
        for (int j = 0; j < outputSchema.numColumns(); j++) {
            record.put(fieldNames.get(j), WritableValueRetriever.getUnderlyingValue(recordToMap.get(j)));

        }

        ret.add(record);
    }

    return Pair.of(convertWithErrors.getKey(), ret);
}
 
Example #10
Source File: ArrowConverter.java    From deeplearning4j with Apache License 2.0
/**
 * Read a datavec schema and record set
 * from the given bytes (usually expected to be an arrow format file)
 * @param input the input to read
 * @return the associated datavec schema and record
 */
public static Pair<Schema,ArrowWritableRecordBatch> readFromBytes(byte[] input) throws IOException {
    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
    Schema retSchema = null;
    ArrowWritableRecordBatch ret = null;
    SeekableReadChannel channel = new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(input));
    ArrowFileReader reader = new ArrowFileReader(channel, allocator);
    reader.loadNextBatch();
    retSchema = toDatavecSchema(reader.getVectorSchemaRoot().getSchema());
    //load the batch
    VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot());
    VectorLoader vectorLoader = new VectorLoader(reader.getVectorSchemaRoot());
    ArrowRecordBatch recordBatch = unloader.getRecordBatch();

    vectorLoader.load(recordBatch);
    ret = asDataVecBatch(recordBatch,retSchema,reader.getVectorSchemaRoot());
    ret.setUnloader(unloader);

    return Pair.of(retSchema,ret);

}
 
Example #11
Source File: ArrowConverterTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testToArrayFromINDArray() {
    Schema.Builder schemaBuilder = new Schema.Builder();
    schemaBuilder.addColumnNDArray("outputArray",new long[]{1,4});
    Schema schema = schemaBuilder.build();
    int numRows = 4;
    List<List<Writable>> ret = new ArrayList<>(numRows);
    for(int i = 0; i < numRows; i++) {
        ret.add(Arrays.<Writable>asList(new NDArrayWritable(Nd4j.linspace(1,4,4).reshape(1, 4))));
    }

    List<FieldVector> fieldVectors = ArrowConverter.toArrowColumns(bufferAllocator, schema, ret);
    ArrowWritableRecordBatch arrowWritableRecordBatch = new ArrowWritableRecordBatch(fieldVectors,schema);
    INDArray array = ArrowConverter.toArray(arrowWritableRecordBatch);
    assertArrayEquals(new long[]{4,4},array.shape());

    INDArray assertion = Nd4j.repeat(Nd4j.linspace(1,4,4),4).reshape(4,4);
    assertEquals(assertion,array);
}
 
Example #12
Source File: ArrowConverterTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testArrowBatchSetTime() {
    Schema.Builder schema = new Schema.Builder();
    List<String> single = new ArrayList<>();
    for(int i = 0; i < 2; i++) {
        schema.addColumnTime(String.valueOf(i),TimeZone.getDefault());
        single.add(String.valueOf(i));
    }

    List<List<Writable>> input = Arrays.asList(
            Arrays.<Writable>asList(new LongWritable(0),new LongWritable(1)),
            Arrays.<Writable>asList(new LongWritable(2),new LongWritable(3))
    );

    List<FieldVector> fieldVector = ArrowConverter.toArrowColumns(bufferAllocator,schema.build(),input);
    ArrowWritableRecordBatch writableRecordBatch = new ArrowWritableRecordBatch(fieldVector,schema.build());
    List<Writable> assertion = Arrays.<Writable>asList(new LongWritable(4), new LongWritable(5));
    writableRecordBatch.set(1, Arrays.<Writable>asList(new LongWritable(4),new LongWritable(5)));
    List<Writable> recordTest = writableRecordBatch.get(1);
    assertEquals(assertion,recordTest);
}
 
Example #13
Source File: ArrowConverterTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testArrowBatchSet() {
    Schema.Builder schema = new Schema.Builder();
    List<String> single = new ArrayList<>();
    for(int i = 0; i < 2; i++) {
        schema.addColumnInteger(String.valueOf(i));
        single.add(String.valueOf(i));
    }

    List<List<Writable>> input = Arrays.asList(
            Arrays.<Writable>asList(new IntWritable(0),new IntWritable(1)),
            Arrays.<Writable>asList(new IntWritable(2),new IntWritable(3))
    );

    List<FieldVector> fieldVector = ArrowConverter.toArrowColumns(bufferAllocator,schema.build(),input);
    ArrowWritableRecordBatch writableRecordBatch = new ArrowWritableRecordBatch(fieldVector,schema.build());
    List<Writable> assertion = Arrays.<Writable>asList(new IntWritable(4), new IntWritable(5));
    writableRecordBatch.set(1, Arrays.<Writable>asList(new IntWritable(4),new IntWritable(5)));
    List<Writable> recordTest = writableRecordBatch.get(1);
    assertEquals(assertion,recordTest);
}
 
Example #14
Source File: ArrowConverter.java    From DataVec with Apache License 2.0
/**
 * Create an ndarray from a matrix.
 * Every column in the batch must have the same number of rows,
 * because the resulting {@link INDArray} is a single rectangular matrix.
 * Note that the input columns must also be numerical. If they aren't numerical already,
 * consider using an {@link org.datavec.api.transform.TransformProcess} to transform the data
 * output from {@link org.datavec.arrow.recordreader.ArrowRecordReader} into the proper format
 * for use with this method for direct conversion.
 *
 * @param arrowWritableRecordBatch the incoming batch. This is typically output from
 *                                 an {@link org.datavec.arrow.recordreader.ArrowRecordReader}
 * @return an {@link INDArray} representative of the input data
 */
public static INDArray toArray(ArrowWritableRecordBatch arrowWritableRecordBatch) {
    List<FieldVector> columnVectors = arrowWritableRecordBatch.getList();
    Schema schema = arrowWritableRecordBatch.getSchema();
    for(int i = 0; i < schema.numColumns(); i++) {
        switch(schema.getType(i)) {
            case Integer:
                break;
            case Float:
                break;
            case Double:
                break;
            case Long:
                break;
            default:
                throw new ND4JIllegalArgumentException("Illegal data type found for column " + schema.getName(i));
        }
    }

    int rows  = arrowWritableRecordBatch.getList().get(0).getValueCount();
    int cols = schema.numColumns();
    INDArray arr  = Nd4j.create(rows,cols);
    for(int i = 0; i < cols; i++) {
        INDArray put = ArrowConverter.convertArrowVector(columnVectors.get(i),schema.getType(i));
        switch(arr.data().dataType()) {
            case FLOAT:
                arr.putColumn(i,Nd4j.create(put.data().asFloat()).reshape(rows,1));
                break;
            case DOUBLE:
                arr.putColumn(i,Nd4j.create(put.data().asDouble()).reshape(rows,1));
                break;
        }

    }

    return arr;
}
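A hedged usage sketch for toArray (not part of the original source): build a small all-numeric batch, as the tests above do, and convert it to a 2x2 INDArray. The column names are illustrative and imports mirror the test examples (org.datavec.api.writable.DoubleWritable, org.nd4j.linalg.api.ndarray.INDArray).

// Sketch: numeric ArrowWritableRecordBatch -> INDArray with one row per record.
public static INDArray batchToMatrix(BufferAllocator allocator) {
    Schema schema = new Schema.Builder()
            .addColumnDouble("x")
            .addColumnDouble("y")
            .build();
    List<List<Writable>> rows = Arrays.asList(
            Arrays.<Writable>asList(new DoubleWritable(1.0), new DoubleWritable(2.0)),
            Arrays.<Writable>asList(new DoubleWritable(3.0), new DoubleWritable(4.0)));

    List<FieldVector> columns = ArrowConverter.toArrowColumns(allocator, schema, rows);
    ArrowWritableRecordBatch batch = new ArrowWritableRecordBatch(columns, schema);

    INDArray matrix = ArrowConverter.toArray(batch); // shape [2, 2]
    return matrix;
}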
 
Example #15
Source File: CSVSparkTransform.java    From DataVec with Apache License 2.0
/**
 * Convert a raw record via the {@link TransformProcess}
 * to a base64-encoded ndarray
 * @param batch the record to convert
 * @return the base64-encoded ndarray
 * @throws IOException
 */
public Base64NDArrayBody toArray(BatchCSVRecord batch) throws IOException {
    List<List<Writable>> converted =  execute(toArrowWritables(toArrowColumnsString(
            bufferAllocator,transformProcess.getInitialSchema(),
            batch.getRecordsAsString()),
            transformProcess.getInitialSchema()),transformProcess);

    ArrowWritableRecordBatch arrowRecordBatch = (ArrowWritableRecordBatch) converted;
    INDArray convert = ArrowConverter.toArray(arrowRecordBatch);
    return new Base64NDArrayBody(Nd4jBase64.base64String(convert));
}
 
Example #16
Source File: ArrowConverterTest.java    From DataVec with Apache License 2.0
@Test
public void testCreateNDArray() throws Exception {
    val recordsToWrite = recordToWrite();
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    ArrowConverter.writeRecordBatchTo(recordsToWrite.getRight(),recordsToWrite.getFirst(),byteArrayOutputStream);

    File tmpFile = new File("tmp-arrow-file-" + UUID.randomUUID().toString() + ".arrorw");
    FileOutputStream outputStream = new FileOutputStream(tmpFile);
    tmpFile.deleteOnExit();
    ArrowConverter.writeRecordBatchTo(recordsToWrite.getRight(),recordsToWrite.getFirst(),outputStream);
    outputStream.flush();
    outputStream.close();

    Pair<Schema, ArrowWritableRecordBatch> schemaArrowWritableRecordBatchPair = ArrowConverter.readFromFile(tmpFile);
    assertEquals(recordsToWrite.getFirst(),schemaArrowWritableRecordBatchPair.getFirst());
    assertEquals(recordsToWrite.getRight(),schemaArrowWritableRecordBatchPair.getRight().toArrayList());

    byte[] arr = byteArrayOutputStream.toByteArray();
    val read = ArrowConverter.readFromBytes(arr);
    assertEquals(recordsToWrite,read);

    //send file
    File tmp =  tmpDataFile(recordsToWrite);
    ArrowRecordReader recordReader = new ArrowRecordReader();

    recordReader.initialize(new FileSplit(tmp));

    recordReader.next();
    ArrowWritableRecordBatch currentBatch = recordReader.getCurrentBatch();
    INDArray arr2 = ArrowConverter.toArray(currentBatch);
    assertEquals(2,arr2.rows());
    assertEquals(2,arr2.columns());
}
 
Example #17
Source File: ArrowConverter.java    From deeplearning4j with Apache License 2.0
private static ArrowWritableRecordBatch asDataVecBatch(ArrowRecordBatch arrowRecordBatch, Schema schema, VectorSchemaRoot vectorLoader) {
    //iterate column wise over the feature vectors, returning entries
    List<FieldVector> fieldVectors = new ArrayList<>();
    for(int j = 0; j < schema.numColumns(); j++) {
        String name = schema.getName(j);
        FieldVector fieldVector = vectorLoader.getVector(name);
        fieldVectors.add(fieldVector);
    }

    ArrowWritableRecordBatch ret = new ArrowWritableRecordBatch(fieldVectors, schema);
    ret.setArrowRecordBatch(arrowRecordBatch);

    return ret;
}
 
Example #18
Source File: ArrowConverterTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testArrowColumnINDArray() {
    Schema.Builder schema = new Schema.Builder();
    List<String> single = new ArrayList<>();
    int numCols = 2;
    INDArray arr = Nd4j.linspace(1,4,4);
    for(int i = 0; i < numCols; i++) {
        schema.addColumnNDArray(String.valueOf(i),new long[]{1,4});
        single.add(String.valueOf(i));
    }

    Schema buildSchema = schema.build();
    List<List<Writable>> list = new ArrayList<>();
    List<Writable> firstRow = new ArrayList<>();
    for(int i = 0 ; i < numCols; i++) {
        firstRow.add(new NDArrayWritable(arr));
    }

    list.add(firstRow);

    List<FieldVector> fieldVectors = ArrowConverter.toArrowColumns(bufferAllocator, buildSchema, list);
    assertEquals(numCols,fieldVectors.size());
    assertEquals(1,fieldVectors.get(0).getValueCount());
    assertFalse(fieldVectors.get(0).isNull(0));

    ArrowWritableRecordBatch arrowWritableRecordBatch = ArrowConverter.toArrowWritables(fieldVectors, buildSchema);
    assertEquals(1,arrowWritableRecordBatch.size());

    Writable writable = arrowWritableRecordBatch.get(0).get(0);
    assertTrue(writable instanceof NDArrayWritable);
    NDArrayWritable ndArrayWritable = (NDArrayWritable) writable;
    assertEquals(arr,ndArrayWritable.get());

    Writable writable1 = ArrowConverter.fromEntry(0, fieldVectors.get(0), ColumnType.NDArray);
    NDArrayWritable ndArrayWritablewritable1 = (NDArrayWritable) writable1;
    System.out.println(ndArrayWritablewritable1.get());

}
 
Example #19
Source File: ArrowConverter.java    From DataVec with Apache License 2.0
private static ArrowWritableRecordBatch asDataVecBatch(ArrowRecordBatch arrowRecordBatch, Schema schema, VectorSchemaRoot vectorLoader) {
    //iterate column wise over the feature vectors, returning entries
    List<FieldVector> fieldVectors = new ArrayList<>();
    for(int j = 0; j < schema.numColumns(); j++) {
        String name = schema.getName(j);
        FieldVector fieldVector = vectorLoader.getVector(name);
        fieldVectors.add(fieldVector);
    }

    ArrowWritableRecordBatch ret = new ArrowWritableRecordBatch(fieldVectors, schema);
    ret.setArrowRecordBatch(arrowRecordBatch);

    return ret;
}
 
Example #20
Source File: ArrowConverterTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testCreateNDArray() throws Exception {
    val recordsToWrite = recordToWrite();
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    ArrowConverter.writeRecordBatchTo(recordsToWrite.getRight(),recordsToWrite.getFirst(),byteArrayOutputStream);

    File f = testDir.newFolder();

    File tmpFile = new File(f, "tmp-arrow-file-" + UUID.randomUUID().toString() + ".arrow");
    FileOutputStream outputStream = new FileOutputStream(tmpFile);
    tmpFile.deleteOnExit();
    ArrowConverter.writeRecordBatchTo(recordsToWrite.getRight(),recordsToWrite.getFirst(),outputStream);
    outputStream.flush();
    outputStream.close();

    Pair<Schema, ArrowWritableRecordBatch> schemaArrowWritableRecordBatchPair = ArrowConverter.readFromFile(tmpFile);
    assertEquals(recordsToWrite.getFirst(),schemaArrowWritableRecordBatchPair.getFirst());
    assertEquals(recordsToWrite.getRight(),schemaArrowWritableRecordBatchPair.getRight().toArrayList());

    byte[] arr = byteArrayOutputStream.toByteArray();
    val read = ArrowConverter.readFromBytes(arr);
    assertEquals(recordsToWrite,read);

    //send file
    File tmp =  tmpDataFile(recordsToWrite);
    ArrowRecordReader recordReader = new ArrowRecordReader();

    recordReader.initialize(new FileSplit(tmp));

    recordReader.next();
    ArrowWritableRecordBatch currentBatch = recordReader.getCurrentBatch();
    INDArray arr2 = ArrowConverter.toArray(currentBatch);
    assertEquals(2,arr2.rows());
    assertEquals(2,arr2.columns());
}
 
Example #21
Source File: CSVSparkTransform.java    From deeplearning4j with Apache License 2.0
/**
 * Convert a raw record via the {@link TransformProcess}
 * to a base64-encoded ndarray
 * @param batch the record to convert
 * @return the base64-encoded ndarray
 * @throws IOException
 */
public Base64NDArrayBody toArray(BatchCSVRecord batch) throws IOException {
    List<List<Writable>> converted =  execute(toArrowWritables(toArrowColumnsString(
            bufferAllocator,transformProcess.getInitialSchema(),
            batch.getRecordsAsString()),
            transformProcess.getInitialSchema()),transformProcess);

    ArrowWritableRecordBatch arrowRecordBatch = (ArrowWritableRecordBatch) converted;
    INDArray convert = ArrowConverter.toArray(arrowRecordBatch);
    return new Base64NDArrayBody(Nd4jBase64.base64String(convert));
}
 
Example #22
Source File: PipelineExecutioner.java    From konduit-serving with Apache License 2.0
/**
 * Creates input for use in the {@link PipelineExecutioner}
 * @param input the input object
 * @param transformProcess the {@link TransformProcess} to use
 * @param conversionSchema The {@link Schema} to use
 * @return the DataVec type input records.
 */
public static Record[] createInput(Object input,TransformProcess transformProcess,Schema conversionSchema) {
    Preconditions.checkNotNull(input, "Input data was null!");

    if(input instanceof String) {
        String inputJson = (String) input;
        if (inputJson.charAt(0) == '{') {
            //json object
            log.info("Auto converting json object to json array");
            inputJson = "[" + input + "]";
        }

        JsonArray jsonArray = new JsonArray(inputJson);
        ArrowWritableRecordBatch convert;
        try {
            convert = mapConverter.convert(conversionSchema, jsonArray, transformProcess);
        } catch (Exception e) {
            log.error("Error performing conversion", e);
            throw e;
        }

        Preconditions.checkNotNull(convert, "Conversion was null!");
        Record[] pipelineInput = new Record[convert.size()];
        for (int i = 0; i < pipelineInput.length; i++) {
            pipelineInput[i] = new ArrowRecord(convert, i, null);
        }

        return pipelineInput;
    }

    else {
        //ndarrays already
        return (Record[]) input;
    }

}
 
Example #23
Source File: PipelineExecutioner.java    From konduit-serving with Apache License 2.0
private void writeArrowResponse(RoutingContext ctx, Schema outputSchema, ArrowWritableRecordBatch convert) {
    log.debug("Writing arrow response.");
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    ArrowUtils.writeRecordBatchTo(convert, outputSchema, byteArrayOutputStream);
    Buffer write = Buffer.buffer(byteArrayOutputStream.toByteArray());
    ctx.response().putHeader("Content-Type", "application/octet-stream");
    ctx.response().putHeader("Content-Length", String.valueOf(write.getBytes().length));
    ctx.response().end(write);
}
 
Example #24
Source File: ArrowUtils.java    From konduit-serving with Apache License 2.0
public static ArrowWritableRecordBatch asDataVecBatch(ArrowRecordBatch arrowRecordBatch, Schema schema, VectorSchemaRoot vectorLoader) {
    List<FieldVector> fieldVectors = new ArrayList();

    for (int j = 0; j < schema.numColumns(); ++j) {
        String name = schema.getName(j);
        FieldVector fieldVector = vectorLoader.getVector(name);
        fieldVectors.add(fieldVector);
    }

    ArrowWritableRecordBatch ret = new ArrowWritableRecordBatch(fieldVectors, schema);
    ret.setArrowRecordBatch(arrowRecordBatch);
    return ret;
}
 
Example #25
Source File: ArrowBinaryInputAdapter.java    From konduit-serving with Apache License 2.0
@Override
public ArrowWritableRecordBatch convert(Buffer input, ConverterArgs parameters, Map<String, Object> contextData) {
    ArrowRecordReader arrowRecordReader = new ArrowRecordReader();
    arrowRecordReader.initialize(new InputStreamInputSplit(new ByteArrayInputStream(input.getBytes())));
    arrowRecordReader.next();
    return arrowRecordReader.getCurrentBatch();
}
 
Example #26
Source File: ArrowUtils.java    From konduit-serving with Apache License 2.0
public static void writeRecordBatchTo(BufferAllocator bufferAllocator, List<List<Writable>> recordBatch, Schema inputSchema, OutputStream outputStream) {
    if (!(recordBatch instanceof ArrowWritableRecordBatch)) {
        convertWritables(bufferAllocator, recordBatch, inputSchema, outputStream);
    } else {
        convertWritables(bufferAllocator, recordBatch, inputSchema, outputStream);
    }

}
 
Example #27
Source File: BatchInputParser.java    From konduit-serving with Apache License 2.0
/**
 * Create a batch from the {@link RoutingContext}
 *
 * @param routingContext the routing context to create the batch from
 * @return the proper ndarray batch with the ndarrays merged
 * with a batch per input
 * @throws IOException I/O Exception
 */
public Record[] createBatch(RoutingContext routingContext) throws IOException {
    //partition the input content by name
    Map<String, List<BatchPartInfo>> partInfo = partInfoForUploads(routingContext);
    if (partInfo.isEmpty()) {
        throw new IllegalArgumentException("No parts resolved for file uploads!");
    } else if (!inputParts.containsAll(partInfo.keySet())) {
        throw new IllegalArgumentException("Illegal part info resolved. Part info keys were " + partInfo.keySet() + " while input parts were " + inputParts);
    }

    //batch size
    Record[] inputBatches = new Record[inputParts.size()];
    for (int j = 0; j < inputBatches.length; j++) {
        inputBatches[j] =
                new org.datavec.api.records.impl.Record(
                        new ArrayList<>(inputParts.size()),
                        null);
        inputBatches[j].getRecord().add(null);
    }

    Map<Integer, List<List<Writable>>> missingIndices = new LinkedHashMap<>();
    for (int i = 0; i < inputParts.size(); i++) {
        if (inputParts.get(i) == null || !partInfo.containsKey(inputParts.get(i))) {
            throw new IllegalStateException("No part found for part " + inputParts.get(i)
                    + " available parts " + partInfo.keySet());
        }

        List<BatchPartInfo> batch = partInfo.get(inputParts.get(i));
        for (int j = 0; j < batch.size(); j++) {
            Pair<String, Integer> partNameAndIndex = partNameAndIndex(batch.get(j).getPartName());
            Buffer buffer = loadBuffer(routingContext,
                    batch.get(j).getFileUploadPath());
            Object convert = convert(buffer, partNameAndIndex.getFirst(), null, routingContext);
            Preconditions.checkNotNull(convert, "Converted writable was null!");
            //set the name
            if (convert instanceof Writable) {
                Writable writable = (Writable) convert;
                inputBatches[i].getRecord().set(j, writable);
            } else {
                ArrowWritableRecordBatch arrow = (ArrowWritableRecordBatch) convert;
                missingIndices.put(j, arrow);
            }
        }
    }

    if (!missingIndices.isEmpty()) {
        List<Record> newRetRecords = new ArrayList<>();

        for (Map.Entry<Integer, List<List<Writable>>> entry : missingIndices.entrySet()) {
            for (List<Writable> record : entry.getValue()) {
                newRetRecords.add(new org.datavec.api.records.impl.Record(record, null));
            }
        }

        return newRetRecords.toArray(new Record[newRetRecords.size()]);
    }

    return inputBatches;
}
 
Example #28
Source File: BaseJsonArrayConverter.java    From konduit-serving with Apache License 2.0
@Override
public Pair<Map<Integer, Integer>, ArrowWritableRecordBatch> convertWithErrors(Schema schema, JsonArray jsonArray, DataPipelineErrorHandler dataPipelineErrorHandler) {
    return convertWithErrors(schema, jsonArray, null, dataPipelineErrorHandler);
}
 
Example #29
Source File: ArrowUtils.java    From konduit-serving with Apache License 2.0
public static ArrowWritableRecordBatch toArrowWritables(List<FieldVector> fieldVectors, Schema schema) {
    ArrowWritableRecordBatch arrowWritableRecordBatch = new ArrowWritableRecordBatch(fieldVectors, schema);
    return arrowWritableRecordBatch;
}
 
Example #30
Source File: ArrowUtils.java    From konduit-serving with Apache License 2.0
public static Pair<Schema, ArrowWritableRecordBatch> readFromFile(File input) throws IOException {
    return readFromFile(new FileInputStream(input));
}