org.datavec.api.split.InputSplit Java Examples

The following examples show how to use org.datavec.api.split.InputSplit. Each example is taken from an open-source project; the source file and the project it comes from are noted above the snippet.
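All the examples follow the same basic pattern: build an InputSplit describing where the data lives, then hand it to a RecordReader via initialize() and iterate over the records. Below is a minimal, self-contained sketch of that pattern, using only classes that appear in the examples; the data directory path is hypothetical.

import java.io.File;
import java.util.List;

import org.datavec.api.records.reader.RecordReader;
import org.datavec.api.records.reader.impl.LineRecordReader;
import org.datavec.api.split.FileSplit;
import org.datavec.api.split.InputSplit;
import org.datavec.api.writable.Writable;

public class InputSplitSketch {
    public static void main(String[] args) throws Exception {
        // Point the split at a directory (or a single file) of text data
        InputSplit split = new FileSplit(new File("/path/to/data")); // hypothetical path
        RecordReader reader = new LineRecordReader();
        reader.initialize(split);
        while (reader.hasNext()) {
            List<Writable> record = reader.next(); // one line of text -> one record
            System.out.println(record);
        }
    }
}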
Example #1
Source File: JacksonRecordReaderTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testReadingJson() throws Exception {
    //Load 3 values from 3 JSON files
    //structure: a:value, b:value, c:x:value, c:y:value
    //And we want to load only a:value, b:value and c:x:value
    //For first JSON file: all values are present
    //For second JSON file: b:value is missing
    //For third JSON file: c:x:value is missing

    ClassPathResource cpr = new ClassPathResource("datavec-api/json/");
    File f = testDir.newFolder();
    cpr.copyDirectory(f);
    String path = new File(f, "json_test_%d.txt").getAbsolutePath();

    InputSplit is = new NumberedFileInputSplit(path, 0, 2);

    RecordReader rr = new JacksonRecordReader(getFieldSelection(), new ObjectMapper(new JsonFactory()));
    rr.initialize(is);

    testJacksonRecordReader(rr);
}
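A note on the NumberedFileInputSplit used above: the %d placeholder is expanded over an inclusive index range, so the (0, 2) arguments address three numbered files. A small sketch with a hypothetical path:

// "%d" is replaced by each index in the inclusive range [0, 2], so this split
// covers /tmp/json_test_0.txt, /tmp/json_test_1.txt and /tmp/json_test_2.txt
InputSplit is = new NumberedFileInputSplit("/tmp/json_test_%d.txt", 0, 2);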
 
Example #2
Source File: BaseImageRecordReader.java    From deeplearning4j with Apache License 2.0
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    this.appendLabel = conf.getBoolean(APPEND_LABEL, appendLabel);
    this.labels = new ArrayList<>(conf.getStringCollection(LABELS));
    this.height = conf.getLong(HEIGHT, height);
    this.width = conf.getLong(WIDTH, width);
    this.channels = conf.getLong(CHANNELS, channels);
    this.cropImage = conf.getBoolean(CROP_IMAGE, cropImage);
    if ("imageio".equals(conf.get(IMAGE_LOADER))) {
        this.imageLoader = new ImageLoader(height, width, channels, cropImage);
    } else {
        this.imageLoader = new NativeImageLoader(height, width, channels, imageTransform);
    }
    this.conf = conf;
    initialize(split);
}
 
Example #3
Source File: LineReaderTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testLineReaderWithInputStreamInputSplit() throws Exception {
    File tmpdir = testDir.newFolder();

    File tmp1 = new File(tmpdir, "tmp1.txt.gz");

    OutputStream os = new GZIPOutputStream(new FileOutputStream(tmp1, false));
    IOUtils.writeLines(Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"), null, os);
    os.flush();
    os.close();

    InputSplit split = new InputStreamInputSplit(new GZIPInputStream(new FileInputStream(tmp1)));

    RecordReader reader = new LineRecordReader();
    reader.initialize(split);

    int count = 0;
    while (reader.hasNext()) {
        assertEquals(1, reader.next().size());
        count++;
    }

    assertEquals(9, count);
}
 
Example #4
Source File: ListStringInputMarshaller.java    From DataVec with Apache License 2.0
/**
 * Extracts the exchange body (a {@code List<List<String>>}) and wraps it in a split.
 * @param exchange the Camel exchange carrying the input data
 * @return an InputSplit over the exchange body
 */
@Override
public InputSplit getSplit(Exchange exchange) {
    List<List<String>> data = (List<List<String>>) exchange.getIn().getBody();
    InputSplit listSplit = new ListStringSplit(data);
    return listSplit;
}
 
Example #5
Source File: ListStringInputFormat.java    From DataVec with Apache License 2.0
/**
 * Creates a reader from an input split
 *
 * @param split the split to read
 * @return the reader from the given input split
 */
@Override
public RecordReader createReader(InputSplit split) throws IOException, InterruptedException {
    RecordReader reader = new ListStringRecordReader();
    reader.initialize(split);
    return reader;
}
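Examples #4 and #5 pair naturally: the marshaller wraps in-memory data in a ListStringSplit, and the input format reads it back with a ListStringRecordReader. Here is a minimal sketch of that round trip, with import locations given as I understand the DataVec layout (they may vary by version):

import java.util.Arrays;
import java.util.List;

import org.datavec.api.records.reader.RecordReader;
import org.datavec.api.records.reader.impl.collection.ListStringRecordReader;
import org.datavec.api.split.InputSplit;
import org.datavec.api.split.ListStringSplit;
import org.datavec.api.writable.Writable;

public class ListStringSplitSketch {
    public static void main(String[] args) throws Exception {
        List<List<String>> data = Arrays.asList(
                        Arrays.asList("a", "b"),
                        Arrays.asList("c", "d"));
        InputSplit split = new ListStringSplit(data);
        RecordReader reader = new ListStringRecordReader();
        reader.initialize(split);
        while (reader.hasNext()) {
            List<Writable> record = reader.next(); // one inner list -> one record
            System.out.println(record);
        }
    }
}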
 
Example #6
Source File: JacksonRecordReader.java    From deeplearning4j with Apache License 2.0
@Override
public void initialize(InputSplit split) throws IOException, InterruptedException {
    if (split instanceof FileSplit)
        throw new UnsupportedOperationException("Cannot use JacksonRecordReader with FileSplit");
    super.initialize(split);
    this.uris = split.locations();
    if (shuffle) {
        List<URI> list = Arrays.asList(uris);
        Collections.shuffle(list, r);
        uris = list.toArray(new URI[uris.length]);
    }
}
 
Example #7
Source File: FileRecordWriter.java    From deeplearning4j with Apache License 2.0
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    partitioner.init(inputSplit);
    out = new DataOutputStream(partitioner.currentOutputStream());
    this.partitioner = partitioner;

}
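The caller supplies both the destination split and the partitioner. Below is a hedged sketch of driving a writer initialized this way; it assumes DataVec's CSVRecordWriter and NumberOfRecordsPartitioner are available under the import paths shown, and writes to a hypothetical output file.

import java.io.File;
import java.util.Arrays;

import org.datavec.api.records.writer.impl.csv.CSVRecordWriter;
import org.datavec.api.split.FileSplit;
import org.datavec.api.split.partition.NumberOfRecordsPartitioner;
import org.datavec.api.writable.Text;
import org.datavec.api.writable.Writable;

public class RecordWriterSketch {
    public static void main(String[] args) throws Exception {
        CSVRecordWriter writer = new CSVRecordWriter();
        // Hypothetical output file; the partitioner decides how records are
        // distributed across the output stream(s)
        writer.initialize(new FileSplit(new File("/tmp/out.csv")), new NumberOfRecordsPartitioner());
        writer.write(Arrays.<Writable>asList(new Text("a"), new Text("b")));
        writer.close();
    }
}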
 
Example #8
Source File: FileRecordReader.java    From deeplearning4j with Apache License 2.0
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    appendLabel = conf.getBoolean(APPEND_LABEL, true);
    doInitialize(split);
    this.inputSplit = split;
    this.conf = conf;
}
 
Example #9
Source File: FileRecordWriter.java    From DataVec with Apache License 2.0
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    partitioner.init(inputSplit);
    out = new DataOutputStream(partitioner.currentOutputStream());
    this.partitioner = partitioner;

}
 
Example #10
Source File: LineInputFormat.java    From DataVec with Apache License 2.0
@Override
public RecordReader createReader(InputSplit split) throws IOException, InterruptedException {
    LineRecordReader ret = new LineRecordReader();
    ret.initialize(split);
    return ret;

}
 
Example #11
Source File: ExcelRecordWriter.java    From deeplearning4j with Apache License 2.0
@Override
public void initialize(Configuration configuration, InputSplit split, Partitioner partitioner) throws Exception {
    this.workBookName = configuration.get(WORKSHEET_NAME,DEFAULT_WORKSHEET_NAME);
    this.fileTypeToUse = configuration.get(FILE_TYPE,DEFAULT_FILE_TYPE);
    this.conf = configuration;
    partitioner.init(split);
    out = new DataOutputStream(partitioner.currentOutputStream());
    initPoi();
}
 
Example #12
Source File: LineReaderTest.java    From DataVec with Apache License 2.0
@Test
public void testLineReader() throws Exception {
    String tempDir = System.getProperty("java.io.tmpdir");
    File tmpdir = new File(tempDir, "tmpdir-testLineReader");
    if (tmpdir.exists())
        tmpdir.delete();
    tmpdir.mkdir();

    File tmp1 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp1.txt"));
    File tmp2 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp2.txt"));
    File tmp3 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp3.txt"));

    FileUtils.writeLines(tmp1, Arrays.asList("1", "2", "3"));
    FileUtils.writeLines(tmp2, Arrays.asList("4", "5", "6"));
    FileUtils.writeLines(tmp3, Arrays.asList("7", "8", "9"));

    InputSplit split = new FileSplit(tmpdir);

    RecordReader reader = new LineRecordReader();
    reader.initialize(split);

    int count = 0;
    List<List<Writable>> list = new ArrayList<>();
    while (reader.hasNext()) {
        List<Writable> l = reader.next();
        assertEquals(1, l.size());
        list.add(l);
        count++;
    }

    assertEquals(9, count);

    try {
        FileUtils.deleteDirectory(tmpdir);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example #13
Source File: VideoRecordReader.java    From DataVec with Apache License 2.0
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    this.conf = conf;
    this.appendLabel = conf.getBoolean(APPEND_LABEL, false);
    this.height = conf.getInt(HEIGHT, height);
    this.width = conf.getInt(WIDTH, width);
    if ("imageio".equals(conf.get(IMAGE_LOADER))) {
        this.imageLoader = new ImageLoader(height, width);
    } else {
        this.imageLoader = new NativeImageLoader(height, width);
    }

    initialize(split);
}
 
Example #14
Source File: BaseRecordReader.java    From deeplearning4j with Apache License 2.0
@Override
public void initialize(InputSplit split) throws IOException, InterruptedException {
    this.inputSplit = split;
    if(split instanceof StreamInputSplit){
        StreamInputSplit s = (StreamInputSplit)split;
        if(s.getStreamCreatorFn() != null){
            this.streamCreatorFn = s.getStreamCreatorFn();
        }
    }
}
 
Example #15
Source File: LFWLoader.java    From deeplearning4j with Apache License 2.0
public RecordReader getRecordReader(long batchSize, long numExamples, long[] imgDim, long numLabels,
                PathLabelGenerator labelGenerator, boolean train, double splitTrainTest, Random rng) {
    load(batchSize, numExamples, numLabels, labelGenerator, splitTrainTest, rng);
    RecordReader recordReader =
                    new ImageRecordReader(imgDim[0], imgDim[1], imgDim[2], labelGenerator, imageTransform);

    try {
        InputSplit data = train ? inputSplit[0] : inputSplit[1];
        recordReader.initialize(data);
    } catch (IOException | InterruptedException e) {
        log.error("",e);
    }
    return recordReader;
}
 
Example #16
Source File: LineReaderTest.java    From DataVec with Apache License 2.0
@Test
public void testLineReaderWithInputStreamInputSplit() throws Exception {
    String tempDir = System.getProperty("java.io.tmpdir");
    File tmpdir = new File(tempDir, "tmpdir");
    tmpdir.mkdir();

    File tmp1 = new File(tmpdir, "tmp1.txt.gz");

    OutputStream os = new GZIPOutputStream(new FileOutputStream(tmp1, false));
    IOUtils.writeLines(Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"), null, os);
    os.flush();
    os.close();

    InputSplit split = new InputStreamInputSplit(new GZIPInputStream(new FileInputStream(tmp1)));

    RecordReader reader = new LineRecordReader();
    reader.initialize(split);

    int count = 0;
    while (reader.hasNext()) {
        assertEquals(1, reader.next().size());
        count++;
    }

    assertEquals(9, count);

    try {
        FileUtils.deleteDirectory(tmpdir);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example #17
Source File: DataVecConsumer.java    From DataVec with Apache License 2.0
@Override
protected int poll() throws Exception {
    Exchange exchange = endpoint.createExchange();
    InputSplit split = inputFromExchange(exchange);
    RecordReader reader = inputFormat.createReader(split, configuration);
    int numMessagesPolled = 0;
    while (reader.hasNext()) {
        // create a message body
        exchange.getIn().setBody(reader.next());

        try {
            // send message to next processor in the route
            getProcessor().process(exchange);
            numMessagesPolled++; // number of messages polled
        } finally {
            // log exception if an exception occurred and was not handled
            if (exchange.getException() != null) {
                getExceptionHandler().handleException("Error processing exchange", exchange,
                                exchange.getException());
            }
        }
    }

    return numMessagesPolled;
}
 
Example #18
Source File: RegexRecordReaderTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testRegexSequenceRecordReaderMeta() throws Exception {
    String regex = "(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}) (\\d+) ([A-Z]+) (.*)";

    ClassPathResource cpr = new ClassPathResource("datavec-api/logtestdata/");
    File f = testDir.newFolder();
    cpr.copyDirectory(f);
    String path = new File(f, "logtestfile%d.txt").getAbsolutePath();

    InputSplit is = new NumberedFileInputSplit(path, 0, 1);

    SequenceRecordReader rr = new RegexSequenceRecordReader(regex, 1);
    rr.initialize(is);

    List<List<List<Writable>>> out = new ArrayList<>();
    while (rr.hasNext()) {
        out.add(rr.sequenceRecord());
    }

    assertEquals(2, out.size());
    List<List<List<Writable>>> out2 = new ArrayList<>();
    List<SequenceRecord> out3 = new ArrayList<>();
    List<RecordMetaData> meta = new ArrayList<>();
    rr.reset();
    while (rr.hasNext()) {
        SequenceRecord seqr = rr.nextSequence();
        out2.add(seqr.getSequenceRecord());
        out3.add(seqr);
        meta.add(seqr.getMetaData());
    }

    List<SequenceRecord> fromMeta = rr.loadSequenceFromMetaData(meta);

    assertEquals(out, out2);
    assertEquals(out3, fromMeta);
}
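Each capture group in that regex becomes one value in the resulting record. Here is a small, self-contained sketch (plain JDK, with a made-up log line) showing what the four groups extract:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexGroupsSketch {
    public static void main(String[] args) {
        String regex = "(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}) (\\d+) ([A-Z]+) (.*)";
        // A made-up log line in the shape the reader expects
        Matcher m = Pattern.compile(regex).matcher("2016-01-01 23:59:59.001 1 INFO First entry message!");
        if (m.matches()) {
            for (int g = 1; g <= 4; g++) {
                // groups: 1 = timestamp, 2 = numeric id, 3 = log level, 4 = message
                System.out.println("group " + g + ": " + m.group(g));
            }
        }
    }
}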
 
Example #19
Source File: ExcelRecordWriter.java    From deeplearning4j with Apache License 2.0
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    this.conf = new Configuration();
    this.partitioner = partitioner;
    partitioner.init(inputSplit);
    out = new DataOutputStream(partitioner.currentOutputStream());
    initPoi();


}
 
Example #20
Source File: FileRecordReaderTest.java    From DataVec with Apache License 2.0
@Test
public void testMeta() throws Exception {
    FileRecordReader rr = new FileRecordReader();


    URI[] arr = new URI[3];
    arr[0] = new ClassPathResource("csvsequence_0.txt").getFile().toURI();
    arr[1] = new ClassPathResource("csvsequence_1.txt").getFile().toURI();
    arr[2] = new ClassPathResource("csvsequence_2.txt").getFile().toURI();

    InputSplit is = new CollectionInputSplit(Arrays.asList(arr));
    rr.initialize(is);

    List<List<Writable>> out = new ArrayList<>();
    while (rr.hasNext()) {
        out.add(rr.next());
    }

    assertEquals(3, out.size());

    rr.reset();
    List<List<Writable>> out2 = new ArrayList<>();
    List<Record> out3 = new ArrayList<>();
    List<RecordMetaData> meta = new ArrayList<>();
    int count = 0;
    while (rr.hasNext()) {
        Record r = rr.nextRecord();
        out2.add(r.getRecord());
        out3.add(r);
        meta.add(r.getMetaData());

        assertEquals(arr[count++], r.getMetaData().getURI());
    }

    assertEquals(out, out2);
    List<Record> fromMeta = rr.loadFromMetaData(meta);
    assertEquals(out3, fromMeta);
}
 
Example #21
Source File: ArrowRecordReader.java    From deeplearning4j with Apache License 2.0
@Override
public Record loadFromMetaData(RecordMetaData recordMetaData) {
    if(!(recordMetaData instanceof RecordMetaDataIndex)) {
        throw new IllegalArgumentException("Unable to load from meta data. No index specified for record");
    }

    RecordMetaDataIndex index = (RecordMetaDataIndex) recordMetaData;
    InputSplit fileSplit = new FileSplit(new File(index.getURI()));
    initialize(fileSplit);
    this.currIdx = (int) index.getIndex();
    return nextRecord();
}
 
Example #22
Source File: TestSequenceRecordReaderBytesFunction.java    From deeplearning4j with Apache License 2.0
@Test
public void testRecordReaderBytesFunction() throws Exception {

    //Local file path
    File f = testDir.newFolder();
    new ClassPathResource("datavec-spark/video/").copyDirectory(f);
    String path = f.getAbsolutePath() + "/*";

    //Load binary data from local file system, convert to a sequence file:
    //Load and convert
    JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
    JavaPairRDD<Text, BytesWritable> filesAsBytes = origData.mapToPair(new FilesAsBytesFunction());
    //Write the sequence file:
    Path p = Files.createTempDirectory("dl4j_rrbytesTest");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    filesAsBytes.saveAsNewAPIHadoopFile(outPath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);

    //Load data from sequence file, parse via SequenceRecordReader:
    JavaPairRDD<Text, BytesWritable> fromSeqFile = sc.sequenceFile(outPath, Text.class, BytesWritable.class);
    SequenceRecordReader seqRR = new CodecRecordReader();
    Configuration conf = new Configuration();
    conf.set(CodecRecordReader.RAVEL, "true");
    conf.set(CodecRecordReader.START_FRAME, "0");
    conf.set(CodecRecordReader.TOTAL_FRAMES, "25");
    conf.set(CodecRecordReader.ROWS, "64");
    conf.set(CodecRecordReader.COLUMNS, "64");
    Configuration confCopy = new Configuration(conf);
    seqRR.setConf(conf);
    JavaRDD<List<List<Writable>>> dataVecData = fromSeqFile.map(new SequenceRecordReaderBytesFunction(seqRR));



    //Next: do the same thing locally, and compare the results
    InputSplit is = new FileSplit(f, new String[] {"mp4"}, true);
    SequenceRecordReader srr = new CodecRecordReader();
    srr.initialize(is);
    srr.setConf(confCopy);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }
    assertEquals(4, list.size());

    List<List<List<Writable>>> fromSequenceFile = dataVecData.collect();

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        List<List<Writable>> collection = fromSequenceFile.get(i);
        for (int j = 0; j < 4; j++) {
            if (collection.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions
}
 
Example #23
Source File: ArrowRecordWriter.java    From deeplearning4j with Apache License 2.0
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    this.partitioner = partitioner;
    partitioner.init(inputSplit);

}
 
Example #24
Source File: ExcelRecordReader.java    From deeplearning4j with Apache License 2.0
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    super.initialize(conf, split);
    this.skipNumLines = conf.getInt(SKIP_NUM_LINES,0);
}
 
Example #25
Source File: CSVInputFormat.java    From DataVec with Apache License 2.0
@Override
public RecordReader createReader(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    return createReader(split);
}
 
Example #26
Source File: CSVNLinesSequenceRecordReader.java    From DataVec with Apache License 2.0
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    super.initialize(conf, split);
    this.nLinesPerSequence = conf.getInt(LINES_PER_SEQUENCE, nLinesPerSequence);
}
 
Example #27
Source File: MapFileSequenceRecordReader.java    From deeplearning4j with Apache License 2.0
@Override
public void initialize(InputSplit split) throws IOException, InterruptedException {
    initialize(null, split);
}
 
Example #28
Source File: BaseInputFormat.java    From deeplearning4j with Apache License 2.0
@Override
public RecordReader createReader(InputSplit split) throws IOException, InterruptedException {
    return createReader(split, null);
}
 
Example #29
Source File: TestPairSequenceRecordReaderBytesFunction.java    From deeplearning4j with Apache License 2.0
@Test
public void test() throws Exception {
    //Goal: combine separate files together into a hadoop sequence file, for later parsing by a SequenceRecordReader
    //For example: use to combine input and labels data from separate files for training a RNN
    JavaSparkContext sc = getContext();

    File f = testDir.newFolder();
    new ClassPathResource("datavec-spark/video/").copyDirectory(f);
    String path = f.getAbsolutePath() + "/*";

    PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
    JavaPairRDD<Text, BytesPairWritable> toWrite =
                    DataVecSparkUtil.combineFilesForSequenceFile(sc, path, path, pathConverter);

    Path p = Files.createTempDirectory("dl4j_rrbytesPairOut");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    new File(outPath).deleteOnExit();
    toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);

    //Load back into memory:
    JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class);

    SequenceRecordReader srr1 = getReader();
    SequenceRecordReader srr2 = getReader();
    PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2);

    JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables = fromSeq.map(psrbf);
    List<Tuple2<List<List<Writable>>, List<List<Writable>>>> fromSequenceFile = writables.collect();

    //Load manually (single copy) and compare:
    InputSplit is = new FileSplit(f, new String[] {"mp4"}, true);
    SequenceRecordReader srr = getReader();
    srr.initialize(is);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        Tuple2<List<List<Writable>>, List<List<Writable>>> tuple2 = fromSequenceFile.get(i);
        List<List<Writable>> seq1 = tuple2._1();
        List<List<Writable>> seq2 = tuple2._2();
        assertEquals(seq1, seq2);

        for (int j = 0; j < 4; j++) {
            if (seq1.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }
    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions

}
 
Example #30
Source File: ArrowRecordReader.java    From deeplearning4j with Apache License 2.0
@Override
public void initialize(InputSplit split) {
    this.split = split;
    this.pathsIter = split.locationsPathIterator();
}