org.datavec.api.split.InputSplit Java Examples
The following examples show how to use
org.datavec.api.split.InputSplit.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JacksonRecordReaderTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test public void testReadingJson() throws Exception { //Load 3 values from 3 JSON files //stricture: a:value, b:value, c:x:value, c:y:value //And we want to load only a:value, b:value and c:x:value //For first JSON file: all values are present //For second JSON file: b:value is missing //For third JSON file: c:x:value is missing ClassPathResource cpr = new ClassPathResource("datavec-api/json/"); File f = testDir.newFolder(); cpr.copyDirectory(f); String path = new File(f, "json_test_%d.txt").getAbsolutePath(); InputSplit is = new NumberedFileInputSplit(path, 0, 2); RecordReader rr = new JacksonRecordReader(getFieldSelection(), new ObjectMapper(new JsonFactory())); rr.initialize(is); testJacksonRecordReader(rr); }
Example #2
Source File: BaseImageRecordReader.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Configures this reader from {@code conf} and then initializes it with {@code split}.
 *
 * <p>Each setting falls back to the current field value when the key is absent. The image
 * loader is an {@code ImageLoader} when the {@code IMAGE_LOADER} setting equals
 * {@code "imageio"}, and a {@code NativeImageLoader} otherwise.
 *
 * @param conf  configuration to read the reader settings from
 * @param split split forwarded to {@code initialize(InputSplit)}
 */
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    appendLabel = conf.getBoolean(APPEND_LABEL, appendLabel);
    labels = new ArrayList<>(conf.getStringCollection(LABELS));
    height = conf.getLong(HEIGHT, height);
    width = conf.getLong(WIDTH, width);
    channels = conf.getLong(CHANNELS, channels);
    cropImage = conf.getBoolean(CROP_IMAGE, cropImage);

    boolean useImageIo = "imageio".equals(conf.get(IMAGE_LOADER));
    if (!useImageIo) {
        imageLoader = new NativeImageLoader(height, width, channels, imageTransform);
    } else {
        imageLoader = new ImageLoader(height, width, channels, cropImage);
    }

    this.conf = conf;
    initialize(split);
}
Example #3
Source File: LineReaderTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Writes nine lines to a gzip file, reads them back through a {@link LineRecordReader}
 * initialized with an {@link InputStreamInputSplit}, and checks that nine single-value
 * records are produced.
 */
@Test
public void testLineReaderWithInputStreamInputSplit() throws Exception {
    File tmpdir = testDir.newFolder();
    File tmp1 = new File(tmpdir, "tmp1.txt.gz");

    // try-with-resources: the stream is closed (finishing the gzip trailer) even if writing fails.
    try (OutputStream os = new GZIPOutputStream(new FileOutputStream(tmp1, false))) {
        IOUtils.writeLines(Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"), null, os);
    }

    // Close the input stream once reading is done so the temp file is not left open.
    try (GZIPInputStream in = new GZIPInputStream(new FileInputStream(tmp1))) {
        InputSplit split = new InputStreamInputSplit(in);

        RecordReader reader = new LineRecordReader();
        reader.initialize(split);

        int count = 0;
        while (reader.hasNext()) {
            assertEquals(1, reader.next().size());
            count++;
        }

        assertEquals(9, count);
    }
}
Example #4
Source File: ListStringInputMarshaller.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Builds an input split from the body of the exchange's inbound message.
 *
 * @param exchange exchange whose inbound body is cast to {@code List<List<String>>}
 * @return a {@link ListStringSplit} wrapping that body
 */
@Override
public InputSplit getSplit(Exchange exchange) {
    List<List<String>> body = (List<List<String>>) exchange.getIn().getBody();
    return new ListStringSplit(body);
}
Example #5
Source File: ListStringInputFormat.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Creates a {@link ListStringRecordReader} and initializes it with the given split.
 *
 * @param split the split to read
 * @return the initialized reader for the given input split
 */
@Override
public RecordReader createReader(InputSplit split) throws IOException, InterruptedException {
    RecordReader listReader = new ListStringRecordReader();
    listReader.initialize(split);
    return listReader;
}
Example #6
Source File: JacksonRecordReader.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Initializes this reader from the locations of the given split.
 *
 * <p>The URIs returned by {@code split.locations()} are stored in {@code uris}; when
 * {@code shuffle} is set they are shuffled using the random source {@code r}.
 *
 * @param split the split to read; must not be a {@link FileSplit}
 * @throws UnsupportedOperationException if {@code split} is a {@link FileSplit}
 */
@Override
public void initialize(InputSplit split) throws IOException, InterruptedException {
    if (split instanceof FileSplit) {
        throw new UnsupportedOperationException("Cannot use JacksonRecordReader with FileSplit");
    }

    // Fix: forward the argument. The original passed the inputSplit field, so the split
    // supplied by the caller never reached the superclass initializer.
    super.initialize(split);

    this.uris = split.locations();
    if (shuffle) {
        // Arrays.asList is a view over the array, so the shuffle reorders uris itself.
        List<URI> list = Arrays.asList(uris);
        Collections.shuffle(list, r);
        uris = list.toArray(new URI[uris.length]);
    }
}
Example #7
Source File: FileRecordWriter.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Initializes the partitioner with the split and opens the writer's output stream.
 *
 * @param inputSplit  split passed to {@code partitioner.init}
 * @param partitioner partitioner that supplies the current output stream; stored on this writer
 */
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    partitioner.init(inputSplit);
    // Wrap whatever stream the partitioner currently points at.
    this.out = new DataOutputStream(partitioner.currentOutputStream());
    this.partitioner = partitioner;
}
Example #8
Source File: FileRecordReader.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Reads the {@code APPEND_LABEL} setting (default {@code true}), runs {@code doInitialize}
 * on the split, and then records both the configuration and the split on this reader.
 *
 * @param conf  configuration to read {@code APPEND_LABEL} from
 * @param split split to initialize from
 */
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    this.appendLabel = conf.getBoolean(APPEND_LABEL, true);
    doInitialize(split);
    // Recorded only after doInitialize has completed without throwing.
    this.conf = conf;
    this.inputSplit = split;
}
Example #9
Source File: FileRecordWriter.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Prepares this writer: initializes the partitioner with the split, wraps the partitioner's
 * current output stream in a {@link DataOutputStream}, and keeps a reference to the partitioner.
 *
 * @param inputSplit  split handed to the partitioner
 * @param partitioner source of the output stream to write to
 */
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    partitioner.init(inputSplit);
    this.out = new DataOutputStream(partitioner.currentOutputStream());
    this.partitioner = partitioner;
}
Example #10
Source File: LineInputFormat.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Creates a {@link LineRecordReader} and initializes it with the given split.
 *
 * @param split the split to read
 * @return the initialized line reader
 */
@Override
public RecordReader createReader(InputSplit split) throws IOException, InterruptedException {
    LineRecordReader lineReader = new LineRecordReader();
    lineReader.initialize(split);
    return lineReader;
}
Example #11
Source File: ExcelRecordWriter.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Initializes this writer from a configuration, a split and a partitioner.
 *
 * <p>Reads the worksheet name and file type from {@code configuration} (falling back to
 * {@code DEFAULT_WORKSHEET_NAME} / {@code DEFAULT_FILE_TYPE}), initializes the partitioner,
 * opens the output stream it provides, and finally calls {@code initPoi()}.
 *
 * @param configuration source of the {@code WORKSHEET_NAME} and {@code FILE_TYPE} settings
 * @param split         split handed to {@code partitioner.init}
 * @param partitioner   partitioner that supplies the output stream
 */
@Override
public void initialize(Configuration configuration, InputSplit split, Partitioner partitioner) throws Exception {
    this.workBookName = configuration.get(WORKSHEET_NAME, DEFAULT_WORKSHEET_NAME);
    this.fileTypeToUse = configuration.get(FILE_TYPE, DEFAULT_FILE_TYPE);
    this.conf = configuration;
    // Fix: keep a reference to the partitioner, as initialize(InputSplit, Partitioner) does;
    // the original left the field unset on this code path.
    this.partitioner = partitioner;
    partitioner.init(split);
    out = new DataOutputStream(partitioner.currentOutputStream());
    initPoi();
}
Example #12
Source File: LineReaderTest.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Writes three 3-line text files into a temporary directory, reads the directory through a
 * {@link LineRecordReader} initialized with a {@link FileSplit}, and checks that nine
 * single-value records are produced.
 */
@Test
public void testLineReader() throws Exception {
    String tempDir = System.getProperty("java.io.tmpdir");
    File tmpdir = new File(tempDir, "tmpdir-testLineReader");
    // Fix: File.delete() cannot remove a non-empty directory, so a directory left behind by
    // an earlier run was never actually cleared. Delete it recursively instead.
    if (tmpdir.exists()) {
        FileUtils.deleteDirectory(tmpdir);
    }
    tmpdir.mkdir();

    try {
        File tmp1 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp1.txt"));
        File tmp2 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp2.txt"));
        File tmp3 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp3.txt"));

        FileUtils.writeLines(tmp1, Arrays.asList("1", "2", "3"));
        FileUtils.writeLines(tmp2, Arrays.asList("4", "5", "6"));
        FileUtils.writeLines(tmp3, Arrays.asList("7", "8", "9"));

        InputSplit split = new FileSplit(tmpdir);

        RecordReader reader = new LineRecordReader();
        reader.initialize(split);

        int count = 0;
        List<List<Writable>> list = new ArrayList<>();
        while (reader.hasNext()) {
            List<Writable> l = reader.next();
            assertEquals(1, l.size());
            list.add(l);
            count++;
        }

        assertEquals(9, count);
    } finally {
        // Fix: clean up even when an assertion above fails. Cleanup stays best-effort so a
        // failure to delete does not mask the real test result.
        try {
            FileUtils.deleteDirectory(tmpdir);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Example #13
Source File: VideoRecordReader.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Configures this reader from {@code conf} and then initializes it with {@code split}.
 *
 * <p>{@code APPEND_LABEL} defaults to {@code false}; height and width fall back to the
 * current field values. The image loader is an {@code ImageLoader} when the
 * {@code IMAGE_LOADER} setting equals {@code "imageio"}, and a {@code NativeImageLoader}
 * otherwise.
 *
 * @param conf  configuration to read the reader settings from
 * @param split split forwarded to {@code initialize(InputSplit)}
 */
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    this.conf = conf;
    appendLabel = conf.getBoolean(APPEND_LABEL, false);
    height = conf.getInt(HEIGHT, height);
    width = conf.getInt(WIDTH, width);

    boolean useImageIo = "imageio".equals(conf.get(IMAGE_LOADER));
    if (!useImageIo) {
        imageLoader = new NativeImageLoader(height, width);
    } else {
        imageLoader = new ImageLoader(height, width);
    }

    initialize(split);
}
Example #14
Source File: BaseRecordReader.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Records the split on this reader. If the split is a {@link StreamInputSplit} that carries
 * a non-null stream creator function, that function is adopted as well.
 *
 * @param split the split to read from
 */
@Override
public void initialize(InputSplit split) throws IOException, InterruptedException {
    this.inputSplit = split;

    if (!(split instanceof StreamInputSplit)) {
        return;
    }

    StreamInputSplit streamSplit = (StreamInputSplit) split;
    // Keep the existing stream creator when the split does not provide one.
    if (streamSplit.getStreamCreatorFn() != null) {
        this.streamCreatorFn = streamSplit.getStreamCreatorFn();
    }
}
Example #15
Source File: LFWLoader.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Loads the data and returns an {@link ImageRecordReader} initialized with either the first
 * ({@code train == true}) or the second element of {@code inputSplit}.
 *
 * <p>Initialization failures are logged rather than thrown, so the reader that is returned
 * may be uninitialized if an error occurred.
 *
 * @param batchSize      forwarded to {@code load}
 * @param numExamples    forwarded to {@code load}
 * @param imgDim         image dimensions; elements 0, 1 and 2 are passed to the reader's constructor
 * @param numLabels      forwarded to {@code load}
 * @param labelGenerator label generator used by both {@code load} and the reader
 * @param train          selects {@code inputSplit[0]} when true, {@code inputSplit[1]} otherwise
 * @param splitTrainTest forwarded to {@code load}
 * @param rng            forwarded to {@code load}
 * @return the record reader
 */
public RecordReader getRecordReader(long batchSize, long numExamples, long[] imgDim, long numLabels,
                PathLabelGenerator labelGenerator, boolean train, double splitTrainTest, Random rng) {
    load(batchSize, numExamples, numLabels, labelGenerator, splitTrainTest, rng);
    RecordReader recordReader =
                    new ImageRecordReader(imgDim[0], imgDim[1], imgDim[2], labelGenerator, imageTransform);

    try {
        InputSplit data = train ? inputSplit[0] : inputSplit[1];
        recordReader.initialize(data);
    } catch (IOException e) {
        log.error("Failed to initialize the record reader", e);
    } catch (InterruptedException e) {
        // Fix: restore the interrupt flag so callers further up can still observe the interruption.
        Thread.currentThread().interrupt();
        log.error("Interrupted while initializing the record reader", e);
    }
    return recordReader;
}
Example #16
Source File: LineReaderTest.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Writes nine lines to a gzip file in a temporary directory, reads them back through a
 * {@link LineRecordReader} initialized with an {@link InputStreamInputSplit}, and checks that
 * nine single-value records are produced.
 */
@Test
public void testLineReaderWithInputStreamInputSplit() throws Exception {
    String tempDir = System.getProperty("java.io.tmpdir");
    File tmpdir = new File(tempDir, "tmpdir");
    tmpdir.mkdir();

    try {
        File tmp1 = new File(tmpdir, "tmp1.txt.gz");

        // Fix: try-with-resources closes the stream (finishing the gzip trailer) even if writing fails.
        try (OutputStream os = new GZIPOutputStream(new FileOutputStream(tmp1, false))) {
            IOUtils.writeLines(Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"), null, os);
        }

        // Fix: close the input stream before cleanup so the file is not held open while the
        // directory is being deleted.
        try (GZIPInputStream in = new GZIPInputStream(new FileInputStream(tmp1))) {
            InputSplit split = new InputStreamInputSplit(in);

            RecordReader reader = new LineRecordReader();
            reader.initialize(split);

            int count = 0;
            while (reader.hasNext()) {
                assertEquals(1, reader.next().size());
                count++;
            }

            assertEquals(9, count);
        }
    } finally {
        // Fix: clean up even when an assertion fails. Cleanup stays best-effort so a failure
        // to delete does not mask the real test result.
        try {
            FileUtils.deleteDirectory(tmpdir);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Example #17
Source File: DataVecConsumer.java From DataVec with Apache License 2.0 | 5 votes |
@Override protected int poll() throws Exception { Exchange exchange = endpoint.createExchange(); InputSplit split = inputFromExchange(exchange); RecordReader reader = inputFormat.createReader(split, configuration); int numMessagesPolled = 0; while (reader.hasNext()) { // create a message body while (reader.hasNext()) { exchange.getIn().setBody(reader.next()); try { // send message to next processor in the route getProcessor().process(exchange); numMessagesPolled++; // number of messages polled } finally { // log exception if an exception occurred and was not handled if (exchange.getException() != null) { getExceptionHandler().handleException("Error processing exchange", exchange, exchange.getException()); } } } } return numMessagesPolled; }
Example #18
Source File: RegexRecordReaderTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Reads two numbered log files as sequences, then re-reads them while collecting metadata and
 * checks that (a) both passes yield the same sequences and (b) loading from the collected
 * metadata reproduces the same {@link SequenceRecord}s.
 */
@Test
public void testRegexSequenceRecordReaderMeta() throws Exception {
    String regex = "(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}) (\\d+) ([A-Z]+) (.*)";

    ClassPathResource logResources = new ClassPathResource("datavec-api/logtestdata/");
    File logDir = testDir.newFolder();
    logResources.copyDirectory(logDir);
    String pathPattern = new File(logDir, "logtestfile%d.txt").getAbsolutePath();

    InputSplit numberedFiles = new NumberedFileInputSplit(pathPattern, 0, 1);
    SequenceRecordReader reader = new RegexSequenceRecordReader(regex, 1);
    reader.initialize(numberedFiles);

    // First pass: raw sequences only.
    List<List<List<Writable>>> firstPass = new ArrayList<>();
    while (reader.hasNext()) {
        firstPass.add(reader.sequenceRecord());
    }
    assertEquals(2, firstPass.size());

    // Second pass: sequences together with their records and metadata.
    List<List<List<Writable>>> secondPass = new ArrayList<>();
    List<SequenceRecord> records = new ArrayList<>();
    List<RecordMetaData> metaData = new ArrayList<>();
    reader.reset();
    while (reader.hasNext()) {
        SequenceRecord next = reader.nextSequence();
        secondPass.add(next.getSequenceRecord());
        records.add(next);
        metaData.add(next.getMetaData());
    }

    List<SequenceRecord> fromMeta = reader.loadSequenceFromMetaData(metaData);

    assertEquals(firstPass, secondPass);
    assertEquals(records, fromMeta);
}
Example #19
Source File: ExcelRecordWriter.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Initializes this writer with a fresh {@link Configuration}: stores and initializes the
 * partitioner, opens the output stream it provides, and calls {@code initPoi()}.
 *
 * @param inputSplit  split handed to {@code partitioner.init}
 * @param partitioner partitioner that supplies the output stream
 */
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    conf = new Configuration();
    this.partitioner = partitioner;
    this.partitioner.init(inputSplit);
    this.out = new DataOutputStream(this.partitioner.currentOutputStream());
    initPoi();
}
Example #20
Source File: FileRecordReaderTest.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Reads three files through a {@link FileRecordReader}, then re-reads them while collecting
 * metadata. Checks that each record's metadata URI matches the input URI at the same position,
 * that both passes agree, and that loading from the metadata reproduces the same records.
 */
@Test
public void testMeta() throws Exception {
    FileRecordReader reader = new FileRecordReader();

    URI[] uris = {
        new ClassPathResource("csvsequence_0.txt").getFile().toURI(),
        new ClassPathResource("csvsequence_1.txt").getFile().toURI(),
        new ClassPathResource("csvsequence_2.txt").getFile().toURI()
    };

    InputSplit collectionSplit = new CollectionInputSplit(Arrays.asList(uris));
    reader.initialize(collectionSplit);

    // First pass: record values only.
    List<List<Writable>> firstPass = new ArrayList<>();
    while (reader.hasNext()) {
        firstPass.add(reader.next());
    }
    assertEquals(3, firstPass.size());

    // Second pass: values, records and metadata.
    reader.reset();
    List<List<Writable>> secondPass = new ArrayList<>();
    List<Record> records = new ArrayList<>();
    List<RecordMetaData> metaData = new ArrayList<>();
    int index = 0;
    while (reader.hasNext()) {
        Record next = reader.nextRecord();
        secondPass.add(next.getRecord());
        records.add(next);
        metaData.add(next.getMetaData());

        assertEquals(uris[index], next.getMetaData().getURI());
        index++;
    }

    assertEquals(firstPass, secondPass);
    List<Record> fromMeta = reader.loadFromMetaData(metaData);
    assertEquals(records, fromMeta);
}
Example #21
Source File: ArrowRecordReader.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Loads the record identified by the given metadata: re-initializes this reader on the file
 * at the metadata's URI, positions {@code currIdx} at the metadata's index, and returns the
 * next record.
 *
 * @param recordMetaData metadata of the record to load; must be a {@link RecordMetaDataIndex}
 * @return the record returned by {@code nextRecord()} after repositioning
 * @throws IllegalArgumentException if the metadata is not a {@link RecordMetaDataIndex}
 */
@Override
public Record loadFromMetaData(RecordMetaData recordMetaData) {
    boolean hasIndex = recordMetaData instanceof RecordMetaDataIndex;
    if (!hasIndex) {
        throw new IllegalArgumentException("Unable to load from meta data. No index specified for record");
    }

    RecordMetaDataIndex indexed = (RecordMetaDataIndex) recordMetaData;
    File source = new File(indexed.getURI());
    InputSplit sourceSplit = new FileSplit(source);
    initialize(sourceSplit);
    currIdx = (int) indexed.getIndex();
    return nextRecord();
}
Example #22
Source File: TestSequenceRecordReaderBytesFunction.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test public void testRecordReaderBytesFunction() throws Exception { //Local file path File f = testDir.newFolder(); new ClassPathResource("datavec-spark/video/").copyDirectory(f); String path = f.getAbsolutePath() + "/*"; //Load binary data from local file system, convert to a sequence file: //Load and convert JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path); JavaPairRDD<Text, BytesWritable> filesAsBytes = origData.mapToPair(new FilesAsBytesFunction()); //Write the sequence file: Path p = Files.createTempDirectory("dl4j_rrbytesTest"); p.toFile().deleteOnExit(); String outPath = p.toString() + "/out"; filesAsBytes.saveAsNewAPIHadoopFile(outPath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class); //Load data from sequence file, parse via SequenceRecordReader: JavaPairRDD<Text, BytesWritable> fromSeqFile = sc.sequenceFile(outPath, Text.class, BytesWritable.class); SequenceRecordReader seqRR = new CodecRecordReader(); Configuration conf = new Configuration(); conf.set(CodecRecordReader.RAVEL, "true"); conf.set(CodecRecordReader.START_FRAME, "0"); conf.set(CodecRecordReader.TOTAL_FRAMES, "25"); conf.set(CodecRecordReader.ROWS, "64"); conf.set(CodecRecordReader.COLUMNS, "64"); Configuration confCopy = new Configuration(conf); seqRR.setConf(conf); JavaRDD<List<List<Writable>>> dataVecData = fromSeqFile.map(new SequenceRecordReaderBytesFunction(seqRR)); //Next: do the same thing locally, and compare the results InputSplit is = new FileSplit(f, new String[] {"mp4"}, true); SequenceRecordReader srr = new CodecRecordReader(); srr.initialize(is); srr.setConf(confCopy); List<List<List<Writable>>> list = new ArrayList<>(4); while (srr.hasNext()) { list.add(srr.sequenceRecord()); } assertEquals(4, list.size()); List<List<List<Writable>>> fromSequenceFile = dataVecData.collect(); assertEquals(4, list.size()); assertEquals(4, fromSequenceFile.size()); boolean[] found = new boolean[4]; for (int i = 0; i < 4; i++) { int foundIndex = -1; 
List<List<Writable>> collection = fromSequenceFile.get(i); for (int j = 0; j < 4; j++) { if (collection.equals(list.get(j))) { if (foundIndex != -1) fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen) foundIndex = j; if (found[foundIndex]) fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list found[foundIndex] = true; //mark this one as seen before } } } int count = 0; for (boolean b : found) if (b) count++; assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions }
Example #23
Source File: ArrowRecordWriter.java From deeplearning4j with Apache License 2.0 | 4 votes |
/**
 * Stores the partitioner on this writer and initializes it with the given split.
 *
 * @param inputSplit  split handed to {@code partitioner.init}
 * @param partitioner partitioner to store and initialize
 */
@Override
public void initialize(InputSplit inputSplit, Partitioner partitioner) throws Exception {
    this.partitioner = partitioner;
    this.partitioner.init(inputSplit);
}
Example #24
Source File: ExcelRecordReader.java From deeplearning4j with Apache License 2.0 | 4 votes |
/**
 * Delegates to the superclass initializer, then reads {@code SKIP_NUM_LINES} from the
 * configuration (default {@code 0}).
 *
 * @param conf  configuration to read {@code SKIP_NUM_LINES} from
 * @param split split forwarded to the superclass
 */
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    super.initialize(conf, split);
    int linesToSkip = conf.getInt(SKIP_NUM_LINES, 0);
    this.skipNumLines = linesToSkip;
}
Example #25
Source File: CSVInputFormat.java From DataVec with Apache License 2.0 | 4 votes |
/**
 * Creates a reader for the split by delegating to {@code createReader(InputSplit)}.
 *
 * @param split the split to read
 * @param conf  not used by this method
 * @return the reader produced by {@code createReader(split)}
 */
@Override
public RecordReader createReader(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    RecordReader reader = createReader(split);
    return reader;
}
Example #26
Source File: CSVNLinesSequenceRecordReader.java From DataVec with Apache License 2.0 | 4 votes |
/**
 * Delegates to the superclass initializer, then reads {@code LINES_PER_SEQUENCE} from the
 * configuration, keeping the current value when the key is absent.
 *
 * @param conf  configuration to read {@code LINES_PER_SEQUENCE} from
 * @param split split forwarded to the superclass
 */
@Override
public void initialize(Configuration conf, InputSplit split) throws IOException, InterruptedException {
    super.initialize(conf, split);
    int currentDefault = nLinesPerSequence;
    nLinesPerSequence = conf.getInt(LINES_PER_SEQUENCE, currentDefault);
}
Example #27
Source File: MapFileSequenceRecordReader.java From deeplearning4j with Apache License 2.0 | 4 votes |
/**
 * Initializes this reader with the given split and no configuration, by delegating to the
 * two-argument {@code initialize} with {@code null} as its first argument.
 *
 * @param split the split to read
 */
@Override
public void initialize(InputSplit split) throws IOException, InterruptedException {
    // No configuration is available on this code path.
    initialize(null, split);
}
Example #28
Source File: BaseInputFormat.java From deeplearning4j with Apache License 2.0 | 4 votes |
/**
 * Creates a reader for the split without a configuration, by delegating to the two-argument
 * {@code createReader} with {@code null} as its second argument.
 *
 * @param split the split to read
 * @return the reader produced by {@code createReader(split, null)}
 */
@Override
public RecordReader createReader(InputSplit split) throws IOException, InterruptedException {
    // No configuration is available on this code path.
    return createReader(split, null);
}
Example #29
Source File: TestPairSequenceRecordReaderBytesFunction.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test public void test() throws Exception { //Goal: combine separate files together into a hadoop sequence file, for later parsing by a SequenceRecordReader //For example: use to combine input and labels data from separate files for training a RNN JavaSparkContext sc = getContext(); File f = testDir.newFolder(); new ClassPathResource("datavec-spark/video/").copyDirectory(f); String path = f.getAbsolutePath() + "/*"; PathToKeyConverter pathConverter = new PathToKeyConverterFilename(); JavaPairRDD<Text, BytesPairWritable> toWrite = DataVecSparkUtil.combineFilesForSequenceFile(sc, path, path, pathConverter); Path p = Files.createTempDirectory("dl4j_rrbytesPairOut"); p.toFile().deleteOnExit(); String outPath = p.toString() + "/out"; new File(outPath).deleteOnExit(); toWrite.saveAsNewAPIHadoopFile(outPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class); //Load back into memory: JavaPairRDD<Text, BytesPairWritable> fromSeq = sc.sequenceFile(outPath, Text.class, BytesPairWritable.class); SequenceRecordReader srr1 = getReader(); SequenceRecordReader srr2 = getReader(); PairSequenceRecordReaderBytesFunction psrbf = new PairSequenceRecordReaderBytesFunction(srr1, srr2); JavaRDD<Tuple2<List<List<Writable>>, List<List<Writable>>>> writables = fromSeq.map(psrbf); List<Tuple2<List<List<Writable>>, List<List<Writable>>>> fromSequenceFile = writables.collect(); //Load manually (single copy) and compare: InputSplit is = new FileSplit(f, new String[] {"mp4"}, true); SequenceRecordReader srr = getReader(); srr.initialize(is); List<List<List<Writable>>> list = new ArrayList<>(4); while (srr.hasNext()) { list.add(srr.sequenceRecord()); } assertEquals(4, list.size()); assertEquals(4, fromSequenceFile.size()); boolean[] found = new boolean[4]; for (int i = 0; i < 4; i++) { int foundIndex = -1; Tuple2<List<List<Writable>>, List<List<Writable>>> tuple2 = fromSequenceFile.get(i); List<List<Writable>> seq1 = tuple2._1(); List<List<Writable>> seq2 = tuple2._2(); 
assertEquals(seq1, seq2); for (int j = 0; j < 4; j++) { if (seq1.equals(list.get(j))) { if (foundIndex != -1) fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen) foundIndex = j; if (found[foundIndex]) fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list found[foundIndex] = true; //mark this one as seen before } } } int count = 0; for (boolean b : found) if (b) count++; assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions }
Example #30
Source File: ArrowRecordReader.java From deeplearning4j with Apache License 2.0 | 4 votes |
/**
 * Stores the split and obtains the iterator over its location paths.
 *
 * @param split the split to read from
 */
@Override
public void initialize(InputSplit split) {
    this.split = split;
    pathsIter = this.split.locationsPathIterator();
}