org.datavec.api.split.InputSplit Java Examples

Source File: JacksonRecordReaderTest.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Initializes a {@link JacksonRecordReader} over three numbered JSON test files and passes
 * it to {@code testJacksonRecordReader} for verification.
 */
@Test
public void testReadingJson() throws Exception {
    // Load 3 values from 3 JSON files
    // structure: a:value, b:value, c:x:value, c:y:value
    // We only want to load a:value, b:value and c:x:value
    //   first JSON file:  all values are present
    //   second JSON file: b:value is missing
    //   third JSON file:  c:x:value is missing

    // Copy the classpath test resources into a fresh temporary folder
    ClassPathResource jsonResources = new ClassPathResource("datavec-api/json/");
    File workDir = testDir.newFolder();
    jsonResources.copyDirectory(workDir);

    // "%d" is the placeholder for the file number
    String pathPattern = new File(workDir, "json_test_%d.txt").getAbsolutePath();
    InputSplit numberedSplit = new NumberedFileInputSplit(pathPattern, 0, 2);

    RecordReader jsonReader = new JacksonRecordReader(getFieldSelection(), new ObjectMapper(new JsonFactory()));
    jsonReader.initialize(numberedSplit);

    testJacksonRecordReader(jsonReader);
}

Source File: BaseImageRecordReader.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Configures this reader from {@code conf} and then initializes it with {@code split}.
 * <p>
 * Each setting falls back to the reader's current field value when it is absent from the
 * configuration. The image loader is an {@code ImageLoader} when {@code IMAGE_LOADER} is
 * {@code "imageio"}, and a {@code NativeImageLoader} otherwise.
 *
 * @param conf  configuration to read the settings from
 * @param split the split handed on to {@code initialize(InputSplit)}
 */
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    appendLabel = conf.getBoolean(APPEND_LABEL, appendLabel);
    labels = new ArrayList<>(conf.getStringCollection(LABELS));
    height = conf.getLong(HEIGHT, height);
    width = conf.getLong(WIDTH, width);
    channels = conf.getLong(CHANNELS, channels);
    cropImage = conf.getBoolean(CROP_IMAGE, cropImage);

    // The native loader is the default; "imageio" must be requested explicitly
    if (!"imageio".equals(conf.get(IMAGE_LOADER))) {
        imageLoader = new NativeImageLoader(height, width, channels, imageTransform);
    } else {
        imageLoader = new ImageLoader(height, width, channels, cropImage);
    }

    this.conf = conf;
    initialize(split);
}

Source File: LineReaderTest.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Writes nine lines to a gzip-compressed file, reads them back through a
 * {@link LineRecordReader} backed by an {@link InputStreamInputSplit}, and checks that
 * nine single-value records come out.
 */
@Test
public void testLineReaderWithInputStreamInputSplit() throws Exception {
    File tmpdir = testDir.newFolder();

    File tmp1 = new File(tmpdir, "tmp1.txt.gz");

    // try-with-resources closes the gzip stream even if writing fails
    try (OutputStream os = new GZIPOutputStream(new FileOutputStream(tmp1, false))) {
        IOUtils.writeLines(Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"), null, os);
    }

    // The test owns the input stream, so it is released however the test ends
    try (GZIPInputStream in = new GZIPInputStream(new FileInputStream(tmp1))) {
        InputSplit split = new InputStreamInputSplit(in);

        RecordReader reader = new LineRecordReader();
        reader.initialize(split);

        int count = 0;
        while (reader.hasNext()) {
            assertEquals(1, reader.next().size());
            count++;
        }

        assertEquals(9, count);
    }
}

Source File: ListStringInputMarshaller.java From DataVec with Apache License 2.0

5 votes

/**
 * Builds an input split from the body of the exchange's incoming message.
 *
 * @param exchange the exchange whose "in" message body is cast to {@code List<List<String>>}
 * @return a {@link ListStringSplit} wrapping that body
 */
@Override
public InputSplit getSplit(Exchange exchange) {
    List<List<String>> body = (List<List<String>>) exchange.getIn().getBody();
    return new ListStringSplit(body);
}

Source File: ListStringInputFormat.java From DataVec with Apache License 2.0

5 votes

/**
 * Creates a {@link ListStringRecordReader} and initializes it with the given split.
 *
 * @param split the split to read
 * @return the reader, already initialized with {@code split}
 */
@Override
public RecordReader createReader(InputSplit split) throws IOException, InterruptedException {
    final RecordReader listReader = new ListStringRecordReader();
    listReader.initialize(split);
    return listReader;
}

Source File: JacksonRecordReader.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Initializes this reader with the locations of the given split, shuffling them when
 * {@code shuffle} is set.
 *
 * @param split the split to read; must not be a {@link FileSplit}
 * @throws UnsupportedOperationException if {@code split} is a {@link FileSplit}
 */
@Override
public void initialize(InputSplit split) throws IOException, InterruptedException {
    if (split instanceof FileSplit)
        throw new UnsupportedOperationException("Cannot use JacksonRecordReader with FileSplit");
    // Pass the parameter, not the inputSplit field: the field has not been assigned from
    // this split yet, so the superclass would never see the split supplied by the caller.
    super.initialize(split);
    this.uris = split.locations();
    if (shuffle) {
        // Arrays.asList is a view over the array, so the shuffle reorders uris in place
        List<URI> list = Arrays.asList(uris);
        Collections.shuffle(list, r);
        uris = list.toArray(new URI[uris.length]);
    }
}

Source File: FileRecordWriter.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Initializes the partitioner with the given split, opens a data output stream on the
 * partitioner's current output stream, and stores the partitioner on this writer.
 *
 * @param inputSplit  the split passed to {@code partitioner.init}
 * @param partitioner the partitioner that supplies the output stream
 */
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    // init must run before the current output stream is requested
    partitioner.init(inputSplit);
    this.out = new DataOutputStream(partitioner.currentOutputStream());
    this.partitioner = partitioner;
}

Source File: FileRecordReader.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Reads {@code APPEND_LABEL} from the configuration (default {@code true}), runs
 * {@code doInitialize} on the split, and then stores both the split and the configuration.
 *
 * @param conf  configuration to read {@code APPEND_LABEL} from
 * @param split the split to initialize with
 */
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    this.appendLabel = conf.getBoolean(APPEND_LABEL, true);
    doInitialize(split);

    // The fields are assigned only after doInitialize has completed
    inputSplit = split;
    this.conf = conf;
}

Source File: FileRecordWriter.java From DataVec with Apache License 2.0

5 votes

/**
 * Prepares this writer for output: initializes the partitioner with the split, wraps the
 * partitioner's current output stream in a {@link DataOutputStream}, and keeps the
 * partitioner.
 *
 * @param inputSplit  the split handed to {@code partitioner.init}
 * @param partitioner the source of the output stream
 */
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    partitioner.init(inputSplit);
    // The stream is requested only after the partitioner has been initialized
    this.out = new DataOutputStream(partitioner.currentOutputStream());
    this.partitioner = partitioner;
}

Source File: LineInputFormat.java From DataVec with Apache License 2.0

5 votes

/**
 * Creates a {@link LineRecordReader} and initializes it with the given split.
 *
 * @param split the split to read
 * @return the initialized line reader
 */
@Override
public RecordReader createReader(InputSplit split) throws IOException, InterruptedException {
    final LineRecordReader lineReader = new LineRecordReader();
    lineReader.initialize(split);
    return lineReader;
}

Source File: ExcelRecordWriter.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Initializes this writer from a configuration.
 * <p>
 * Reads the worksheet name and file type (with their defaults) from the configuration,
 * initializes the partitioner with the split, opens the output stream, and calls
 * {@code initPoi()}.
 *
 * @param configuration source of {@code WORKSHEET_NAME} and {@code FILE_TYPE}
 * @param split         the split passed to {@code partitioner.init}
 * @param partitioner   the partitioner that supplies the output stream
 */
@Override
public void initialize(Configuration configuration, InputSplit split, Partitioner partitioner) throws Exception {
    this.workBookName = configuration.get(WORKSHEET_NAME,DEFAULT_WORKSHEET_NAME);
    this.fileTypeToUse = configuration.get(FILE_TYPE,DEFAULT_FILE_TYPE);
    this.conf = configuration;
    // Keep the partitioner, as the two-argument overload does; otherwise the field
    // stays unset after this initialization path.
    this.partitioner = partitioner;
    partitioner.init(split);
    out = new DataOutputStream(partitioner.currentOutputStream());
    initPoi();
}

Source File: LineReaderTest.java From DataVec with Apache License 2.0

5 votes

/**
 * Writes three files of three lines each into a temporary directory, reads the directory
 * through a {@link LineRecordReader} over a {@link FileSplit}, and checks that nine
 * single-value records are produced. The directory is removed afterwards, including when
 * an assertion fails.
 */
@Test
public void testLineReader() throws Exception {
    String tempDir = System.getProperty("java.io.tmpdir");
    File tmpdir = new File(tempDir, "tmpdir-testLineReader");
    // File.delete() cannot remove a non-empty directory, so leftovers from an earlier
    // run are cleared recursively.
    if (tmpdir.isDirectory())
        FileUtils.deleteDirectory(tmpdir);
    else
        tmpdir.delete();
    tmpdir.mkdir();

    try {
        File tmp1 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp1.txt"));
        File tmp2 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp2.txt"));
        File tmp3 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp3.txt"));

        FileUtils.writeLines(tmp1, Arrays.asList("1", "2", "3"));
        FileUtils.writeLines(tmp2, Arrays.asList("4", "5", "6"));
        FileUtils.writeLines(tmp3, Arrays.asList("7", "8", "9"));

        InputSplit split = new FileSplit(tmpdir);

        RecordReader reader = new LineRecordReader();
        reader.initialize(split);

        int count = 0;
        List<List<Writable>> list = new ArrayList<>();
        while (reader.hasNext()) {
            List<Writable> l = reader.next();
            assertEquals(1, l.size());
            list.add(l);
            count++;
        }

        assertEquals(9, count);
    } finally {
        // Best-effort cleanup: a failure here must not mask the test result
        try {
            FileUtils.deleteDirectory(tmpdir);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Source File: VideoRecordReader.java From DataVec with Apache License 2.0

5 votes

/**
 * Configures this reader from {@code conf} and then initializes it with {@code split}.
 * <p>
 * {@code APPEND_LABEL} defaults to {@code false}. Height and width fall back to the
 * current field values. The image loader is an {@code ImageLoader} when
 * {@code IMAGE_LOADER} is {@code "imageio"}, and a {@code NativeImageLoader} otherwise.
 *
 * @param conf  configuration to read the settings from
 * @param split the split handed on to {@code initialize(InputSplit)}
 */
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    this.conf = conf;
    appendLabel = conf.getBoolean(APPEND_LABEL, false);
    height = conf.getInt(HEIGHT, height);
    width = conf.getInt(WIDTH, width);

    // The native loader is the default; "imageio" must be requested explicitly
    if (!"imageio".equals(conf.get(IMAGE_LOADER))) {
        imageLoader = new NativeImageLoader(height, width);
    } else {
        imageLoader = new ImageLoader(height, width);
    }

    initialize(split);
}

Source File: BaseRecordReader.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Stores the split. If it is a {@link StreamInputSplit} with a non-null stream creator
 * function, that function is adopted as well.
 *
 * @param split the split to read from
 */
@Override
public void initialize(InputSplit split) throws IOException, InterruptedException {
    this.inputSplit = split;

    // Only stream splits can carry a stream creator function
    if (!(split instanceof StreamInputSplit)) {
        return;
    }

    StreamInputSplit streamSplit = (StreamInputSplit) split;
    if (streamSplit.getStreamCreatorFn() != null) {
        this.streamCreatorFn = streamSplit.getStreamCreatorFn();
    }
}

Source File: LFWLoader.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Loads the data and returns an {@link ImageRecordReader} initialized on either the first
 * or the second entry of {@code inputSplit}.
 * <p>
 * Failures during initialization are logged and not rethrown, so the reader is returned
 * even when it could not be initialized.
 *
 * @param batchSize      passed through to {@code load}
 * @param numExamples    passed through to {@code load}
 * @param imgDim         the first three entries are passed to the {@link ImageRecordReader} constructor
 * @param numLabels      passed through to {@code load}
 * @param labelGenerator label generator used for loading and by the reader
 * @param train          {@code true} selects {@code inputSplit[0]}, {@code false} selects {@code inputSplit[1]}
 * @param splitTrainTest passed through to {@code load}
 * @param rng            passed through to {@code load}
 * @return the record reader
 */
public RecordReader getRecordReader(long batchSize, long numExamples, long[] imgDim, long numLabels,
                PathLabelGenerator labelGenerator, boolean train, double splitTrainTest, Random rng) {
    load(batchSize, numExamples, numLabels, labelGenerator, splitTrainTest, rng);
    RecordReader recordReader =
                    new ImageRecordReader(imgDim[0], imgDim[1], imgDim[2], labelGenerator, imageTransform);

    try {
        InputSplit data = train ? inputSplit[0] : inputSplit[1];
        recordReader.initialize(data);
    } catch (IOException e) {
        log.error("",e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can still observe the interruption
        Thread.currentThread().interrupt();
        log.error("",e);
    }
    return recordReader;
}

Source File: LineReaderTest.java From DataVec with Apache License 2.0

5 votes

/**
 * Writes nine lines to a gzip-compressed file in a temporary directory, reads them back
 * through a {@link LineRecordReader} backed by an {@link InputStreamInputSplit}, and
 * checks that nine single-value records come out. The streams are closed and the
 * directory is removed even when the test fails.
 */
@Test
public void testLineReaderWithInputStreamInputSplit() throws Exception {
    String tempDir = System.getProperty("java.io.tmpdir");
    File tmpdir = new File(tempDir, "tmpdir");
    tmpdir.mkdir();

    try {
        File tmp1 = new File(tmpdir, "tmp1.txt.gz");

        // try-with-resources closes the gzip stream even if writing fails
        try (OutputStream os = new GZIPOutputStream(new FileOutputStream(tmp1, false))) {
            IOUtils.writeLines(Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"), null, os);
        }

        // The test owns the input stream, so the file handle is released before cleanup
        try (GZIPInputStream in = new GZIPInputStream(new FileInputStream(tmp1))) {
            InputSplit split = new InputStreamInputSplit(in);

            RecordReader reader = new LineRecordReader();
            reader.initialize(split);

            int count = 0;
            while (reader.hasNext()) {
                assertEquals(1, reader.next().size());
                count++;
            }

            assertEquals(9, count);
        }
    } finally {
        // Best-effort cleanup: a failure here must not mask the test result
        try {
            FileUtils.deleteDirectory(tmpdir);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Source File: DataVecConsumer.java From DataVec with Apache License 2.0

5 votes

/**
 * Reads every record from the input derived from a new exchange and sends each one to the
 * processor as the body of the exchange's "in" message.
 *
 * @return the number of records for which {@code process} completed without throwing
 * @throws Exception if creating the reader or processing a record fails
 */
@Override
protected int poll() throws Exception {
    Exchange exchange = endpoint.createExchange();
    InputSplit split = inputFromExchange(exchange);
    RecordReader reader = inputFormat.createReader(split, configuration);
    int numMessagesPolled = 0;
    // A single loop drains the reader; the second, identical nested loop was redundant
    while (reader.hasNext()) {
        // create a message body
        exchange.getIn().setBody(reader.next());

        try {
            // send message to next processor in the route
            getProcessor().process(exchange);
            numMessagesPolled++; // number of messages polled
        } finally {
            // log exception if an exception occurred and was not handled
            if (exchange.getException() != null) {
                getExceptionHandler().handleException("Error processing exchange", exchange,
                                exchange.getException());
            }
        }
    }

    return numMessagesPolled;
}

Source File: RegexRecordReaderTest.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Reads two numbered log files as sequences with a {@link RegexSequenceRecordReader}, then
 * checks that a second pass after {@code reset()} and a reload from the collected metadata
 * both give the same data.
 */
@Test
public void testRegexSequenceRecordReaderMeta() throws Exception {
    String regex = "(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}) (\\d+) ([A-Z]+) (.*)";

    // Copy the classpath test resources into a fresh temporary folder
    ClassPathResource logResources = new ClassPathResource("datavec-api/logtestdata/");
    File workDir = testDir.newFolder();
    logResources.copyDirectory(workDir);
    String pathPattern = new File(workDir, "logtestfile%d.txt").getAbsolutePath();

    InputSplit numberedSplit = new NumberedFileInputSplit(pathPattern, 0, 1);

    SequenceRecordReader rr = new RegexSequenceRecordReader(regex, 1);
    rr.initialize(numberedSplit);

    // First pass: plain sequence data
    List<List<List<Writable>>> firstPass = new ArrayList<>();
    while (rr.hasNext()) {
        firstPass.add(rr.sequenceRecord());
    }
    assertEquals(2, firstPass.size());

    // Second pass: sequence data together with its metadata
    List<List<List<Writable>>> secondPass = new ArrayList<>();
    List<SequenceRecord> records = new ArrayList<>();
    List<RecordMetaData> metaData = new ArrayList<>();
    rr.reset();
    while (rr.hasNext()) {
        SequenceRecord current = rr.nextSequence();
        secondPass.add(current.getSequenceRecord());
        records.add(current);
        metaData.add(current.getMetaData());
    }

    List<SequenceRecord> reloaded = rr.loadSequenceFromMetaData(metaData);

    assertEquals(firstPass, secondPass);
    assertEquals(records, reloaded);
}

Source File: ExcelRecordWriter.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Initializes this writer with a fresh, empty {@link Configuration}.
 * <p>
 * Stores the partitioner, initializes it with the split, opens a data output stream on
 * the partitioner's current output stream, and calls {@code initPoi()}.
 *
 * @param inputSplit  the split passed to {@code partitioner.init}
 * @param partitioner the partitioner that supplies the output stream
 */
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    conf = new Configuration();
    this.partitioner = partitioner;
    // init must run before the current output stream is requested
    partitioner.init(inputSplit);
    this.out = new DataOutputStream(partitioner.currentOutputStream());
    initPoi();
}

Source File: FileRecordReaderTest.java From DataVec with Apache License 2.0

5 votes

/**
 * Reads three files with a {@link FileRecordReader} and checks that each record's metadata
 * URI matches the source file, and that a second pass after {@code reset()} and a reload
 * from the collected metadata both give the same records.
 */
@Test
public void testMeta() throws Exception {
    FileRecordReader rr = new FileRecordReader();

    URI[] uris = {
            new ClassPathResource("csvsequence_0.txt").getFile().toURI(),
            new ClassPathResource("csvsequence_1.txt").getFile().toURI(),
            new ClassPathResource("csvsequence_2.txt").getFile().toURI()
    };

    InputSplit collectionSplit = new CollectionInputSplit(Arrays.asList(uris));
    rr.initialize(collectionSplit);

    // First pass: record contents only
    List<List<Writable>> firstPass = new ArrayList<>();
    while (rr.hasNext()) {
        firstPass.add(rr.next());
    }
    assertEquals(3, firstPass.size());

    // Second pass: records together with their metadata
    rr.reset();
    List<List<Writable>> secondPass = new ArrayList<>();
    List<Record> records = new ArrayList<>();
    List<RecordMetaData> metaData = new ArrayList<>();
    int idx = 0;
    while (rr.hasNext()) {
        Record current = rr.nextRecord();
        secondPass.add(current.getRecord());
        records.add(current);
        metaData.add(current.getMetaData());

        // Records are expected in the same order as the URIs
        assertEquals(uris[idx++], current.getMetaData().getURI());
    }

    assertEquals(firstPass, secondPass);
    List<Record> reloaded = rr.loadFromMetaData(metaData);
    assertEquals(records, reloaded);
}

Source File: ArrowRecordReader.java From deeplearning4j with Apache License 2.0

5 votes

/**
 * Loads the record described by the given metadata.
 * <p>
 * Re-initializes this reader on the file at the metadata's URI, moves the current index
 * to the metadata's index, and returns the next record.
 *
 * @param recordMetaData metadata of the record to load; must be a {@link RecordMetaDataIndex}
 * @return the record read after repositioning
 * @throws IllegalArgumentException if {@code recordMetaData} is not a {@link RecordMetaDataIndex}
 */
@Override
public Record loadFromMetaData(RecordMetaData recordMetaData) {
    if (recordMetaData instanceof RecordMetaDataIndex) {
        RecordMetaDataIndex indexed = (RecordMetaDataIndex) recordMetaData;

        // Re-initialize on the single file the metadata points at
        InputSplit singleFile = new FileSplit(new File(indexed.getURI()));
        initialize(singleFile);

        currIdx = (int) indexed.getIndex();
        return nextRecord();
    }

    throw new IllegalArgumentException("Unable to load from meta data. No index specified for record");
}

Source File: TestSequenceRecordReaderBytesFunction.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Round-trips the test video files through a Hadoop sequence file with Spark, parses them
 * with a {@link CodecRecordReader} via {@link SequenceRecordReaderBytesFunction}, and
 * checks that the result matches reading the same files locally.
 * <p>
 * The comparison is order-independent: each of the 4 Spark results must equal exactly one
 * of the 4 local results.
 */
@Test
public void testRecordReaderBytesFunction() throws Exception {

    //Local file path
    File f = testDir.newFolder();
    new ClassPathResource("datavec-spark/video/").copyDirectory(f);
    String path = f.getAbsolutePath() + "/*";

    //Load binary data from local file system, convert to a sequence file:
    //Load and convert
    JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
    JavaPairRDD<Text, BytesWritable> filesAsBytes = origData.mapToPair(new FilesAsBytesFunction());
    //Write the sequence file:
    Path p = Files.createTempDirectory("dl4j_rrbytesTest");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    filesAsBytes.saveAsNewAPIHadoopFile(outPath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);

    //Load data from sequence file, parse via SequenceRecordReader:
    JavaPairRDD<Text, BytesWritable> fromSeqFile = sc.sequenceFile(outPath, Text.class, BytesWritable.class);
    SequenceRecordReader seqRR = new CodecRecordReader();
    Configuration conf = new Configuration();
    conf.set(CodecRecordReader.RAVEL, "true");
    conf.set(CodecRecordReader.START_FRAME, "0");
    conf.set(CodecRecordReader.TOTAL_FRAMES, "25");
    conf.set(CodecRecordReader.ROWS, "64");
    conf.set(CodecRecordReader.COLUMNS, "64");
    //Copy the configuration so the local reader below gets its own instance with the same settings
    Configuration confCopy = new Configuration(conf);
    seqRR.setConf(conf);
    JavaRDD<List<List<Writable>>> dataVecData = fromSeqFile.map(new SequenceRecordReaderBytesFunction(seqRR));



    //Next: do the same thing locally, and compare the results
    InputSplit is = new FileSplit(f, new String[] {"mp4"}, true);
    SequenceRecordReader srr = new CodecRecordReader();
    srr.initialize(is);
    //NOTE(review): the configuration is set after initialize(); confirm CodecRecordReader picks it up at read time
    srr.setConf(confCopy);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }
    assertEquals(4, list.size());

    List<List<List<Writable>>> fromSequenceFile = dataVecData.collect();

    //(the local size check is repeated here, next to the Spark size check)
    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    //found[j] is set once local sequence j has been matched by a Spark sequence
    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        List<List<Writable>> collection = fromSequenceFile.get(i);
        for (int j = 0; j < 4; j++) {
            if (collection.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions
}

Source File: ArrowRecordWriter.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Stores the partitioner on this writer and initializes it with the given split.
 *
 * @param inputSplit  the split passed to {@code partitioner.init}
 * @param partitioner the partitioner to keep and initialize
 */
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    this.partitioner = partitioner;
    this.partitioner.init(inputSplit);
}

Source File: ExcelRecordReader.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Delegates to the superclass initialization and then reads {@code SKIP_NUM_LINES} from
 * the configuration, defaulting to {@code 0}.
 *
 * @param conf  configuration to read {@code SKIP_NUM_LINES} from
 * @param split the split to initialize with
 */
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    super.initialize(conf, split);
    skipNumLines = conf.getInt(SKIP_NUM_LINES, 0);
}

Source File: CSVInputFormat.java From DataVec with Apache License 2.0

4 votes

/**
 * Creates a reader for the split. The configuration is not used; this delegates to
 * {@code createReader(InputSplit)}.
 *
 * @param split the split to read
 * @param conf  ignored
 * @return the reader produced by {@code createReader(split)}
 */
@Override
public RecordReader createReader(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    final RecordReader reader = createReader(split);
    return reader;
}

Source File: CSVNLinesSequenceRecordReader.java From DataVec with Apache License 2.0

4 votes

/**
 * Delegates to the superclass initialization and then reads {@code LINES_PER_SEQUENCE}
 * from the configuration, keeping the current value when the key is absent.
 *
 * @param conf  configuration to read {@code LINES_PER_SEQUENCE} from
 * @param split the split to initialize with
 */
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    super.initialize(conf, split);
    nLinesPerSequence = conf.getInt(LINES_PER_SEQUENCE, nLinesPerSequence);
}

Source File: MapFileSequenceRecordReader.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Initializes this reader without a configuration by delegating to the two-argument
 * overload with {@code null} as the configuration.
 *
 * @param split the split to initialize with
 */
@Override
public void initialize(InputSplit split) throws IOException, InterruptedException {
    this.initialize(null, split);
}

Source File: BaseInputFormat.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Creates a reader for the split without a configuration by delegating to the
 * two-argument overload with {@code null} as the configuration.
 *
 * @param split the split to read
 * @return the reader produced by {@code createReader(split, null)}
 */
@Override
public RecordReader createReader(InputSplit split) throws IOException, InterruptedException {
    return this.createReader(split, null);
}

Source File: TestPairSequenceRecordReaderBytesFunction.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Combines the test video files pairwise (the same path is used for both sides) into a
 * Hadoop sequence file with Spark, parses the pairs with
 * {@link PairSequenceRecordReaderBytesFunction}, and checks the result against reading the
 * files locally.
 * <p>
 * Both halves of every pair must be equal, and each of the 4 Spark results must equal
 * exactly one of the 4 local results, regardless of order.
 */
@Test
public void test() throws Exception {
    //Goal: combine separate files together into a hadoop sequence file, for later parsing by a SequenceRecordReader
    //For example: use to combine input and labels data from separate files for training a RNN
    JavaSparkContext sc = getContext();

    File f = testDir.newFolder();
    new ClassPathResource("datavec-spark/video/").copyDirectory(f);
    String path = f.getAbsolutePath() + "/*";

    //The same path is passed for both sides of the pair, so each file is paired with itself
    PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
    JavaPairRDD<Text, BytesPairWritable> toWrite =
                    DataVecSparkUtil.combineFilesForSequenceFile(sc, path, path, pathConverter);

    //Write the sequence file to a temporary location
    Path p = Files.createTempDirectory("dl4j_rrbytesPairOut");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    new File(outPath).deleteOnExit();
    toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);

    //Load back into memory:
    JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);

    SequenceRecordReader srr1 = getReader();
    SequenceRecordReader srr2 = getReader();
    PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);

    JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables = fromSeq.map(psrbf);
    List<Tuple2<List<List<Writable>>, List<List<Writable>>>> fromSequenceFile = writables.collect();

    //Load manually (single copy) and compare:
    InputSplit is = new FileSplit(f, new String[] {"mp4"}, true);
    SequenceRecordReader srr = getReader();
    srr.initialize(is);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    //found[j] is set once local sequence j has been matched by a Spark pair
    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        Tuple2<List<List<Writable>>, List<List<Writable>>> tuple2 = fromSequenceFile.get(i);
        List<List<Writable>> seq1 = tuple2._1();
        List<List<Writable>> seq2 = tuple2._2();
        //Both sides were built from the same path, so they must be identical
        assertEquals(seq1, seq2);

        for (int j = 0; j < 4; j++) {
            if (seq1.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions

}

Source File: ArrowRecordReader.java From deeplearning4j with Apache License 2.0

4 votes

/**
 * Stores the split and obtains its location path iterator.
 *
 * @param split the split to read from
 */
@Override
public void initialize(InputSplit split) {
    this.split = split;
    pathsIter = split.locationsPathIterator();
}

org.datavec.api.split.InputSplit Java Examples