Java Code Examples for org.datavec.api.records.reader.RecordReader#nextRecord()

The following examples show how to use org.datavec.api.records.reader.RecordReader#nextRecord(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: VasttextTextVectorizer.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Consumes every remaining record of the reader and feeds its text into the vectorizer.
 *
 * <p>Per record: the first column is converted to a string and tokenized, the cache's
 * document count is incremented by one, the tokens are processed (through the stop-word
 * aware path when {@code stopWords} is set), n-grams are processed when
 * {@code maxNgrams} exceeds one, and finally the optional callback is notified.
 *
 * @param reader   record source, read until {@code hasNext()} reports no more records
 * @param callBack receives each record after it has been processed; may be {@code null}
 */
@Override
public void fit(RecordReader reader, RecordCallBack callBack) {
    while (reader.hasNext()) {
        final Record current = reader.nextRecord();
        final String text = current.getRecord().get(0).toString();
        final Tokenizer tokenizer = tokenizerFactory.create(text);
        cache.incrementNumDocs(1);

        // With stop words in play the resulting tokens may differ from the tokenizer's own.
        final List<String> tokens = (stopWords == null)
                ? doWithTokens(tokenizer)
                : doWithTokensStopWords(tokenizer);

        if (maxNgrams > 1) {
            doWithNgram(ngramsGenerator(tokens));
        }
        if (callBack != null) {
            callBack.onRecord(current);
        }
    }
}
 
Example 2
Source File: VasttextDataIterator.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Reads up to {@code num} records from each configured record reader and hands the
 * collected values (plus metadata, when {@code collectMetaData} is enabled) to
 * {@code nextMultiDataSet}.
 *
 * @param num maximum number of records to pull from each reader
 * @return whatever {@code nextMultiDataSet} builds from the collected values
 * @throws NoSuchElementException if {@code hasNext()} is false on entry
 */
@Override
public MultiDataSet next(int num) {
	if (!hasNext()) {
		throw new NoSuchElementException("No next elements");
	}

	// Step one: pull the raw values out of every record reader, keyed by reader name.
	Map<String, List<List<Writable>>> valuesByReader = new HashMap<>();
	List<RecordMetaDataComposableMap> metaPerExample = null;
	if (collectMetaData) {
		metaPerExample = new ArrayList<RecordMetaDataComposableMap>();
	}

	for (Map.Entry<String, RecordReader> readerEntry : recordReaders.entrySet()) {
		final String readerName = readerEntry.getKey();
		final RecordReader source = readerEntry.getValue();

		// Capacity is capped so that a batch size far larger than the data set does not
		// pre-allocate a huge list.
		final List<List<Writable>> batch = new ArrayList<>(Math.min(num, 100000));

		int i = 0;
		while (i < num && source.hasNext()) {
			final List<Writable> values;
			if (!collectMetaData) {
				values = source.next();
			} else {
				final Record withMeta = source.nextRecord();
				values = withMeta.getRecord();
				// One composable map per example index, created on first use and shared by all readers.
				if (metaPerExample.size() <= i) {
					metaPerExample.add(new RecordMetaDataComposableMap(new HashMap<String, RecordMetaData>()));
				}
				metaPerExample.get(i).getMeta().put(readerName, withMeta.getMetaData());
			}
			batch.add(values);
			i++;
		}

		valuesByReader.put(readerName, batch);
	}

	return nextMultiDataSet(valuesByReader, metaPerExample);
}
 
Example 3
Source File: TextVectorizer.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Drains the reader, tokenizing each record and updating the vectorizer state.
 *
 * <p>For every record: its values are turned into a single string via {@code toString},
 * a tokenizer is created for that string, the cache's document count is incremented by
 * one, and the tokens are processed. The callback, when present, is then notified.
 *
 * @param reader   record source, read until {@code hasNext()} reports no more records
 * @param callBack receives each record after it has been processed; may be {@code null}
 */
@Override
public void fit(RecordReader reader, RecordCallBack callBack) {
    while (reader.hasNext()) {
        final Record current = reader.nextRecord();
        final String text = toString(current.getRecord());
        final Tokenizer tokenizer = tokenizerFactory.create(text);
        cache.incrementNumDocs(1);
        doWithTokens(tokenizer);

        if (callBack == null) {
            continue;
        }
        callBack.onRecord(current);
    }
}
 
Example 4
Source File: ArrowConverterTest.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Loads an {@link ArrowRecordReader} from a single-element metadata list pointing at a
 * freshly written temporary file and checks that the next record has two values.
 */
@Test
public void testRecordReaderMetaDataList() throws Exception {
    // Write the sample records to a temporary file the reader can be pointed at.
    val sampleRecords = recordToWrite();
    File arrowFile = tmpDataFile(sampleRecords);

    RecordReader arrowReader = new ArrowRecordReader();
    RecordMetaDataIndex metaForIndexZero =
            new RecordMetaDataIndex(0, arrowFile.toURI(), ArrowRecordReader.class);
    arrowReader.loadFromMetaData(Arrays.<RecordMetaData>asList(metaForIndexZero));

    Record loaded = arrowReader.nextRecord();
    int valueCount = loaded.getRecord().size();
    assertEquals(2, valueCount);
}
 
Example 5
Source File: ArrowConverterTest.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Loads an {@link ArrowRecordReader} from a single metadata object pointing at a freshly
 * written temporary file and checks that the next record has two values.
 */
@Test
public void testRecordReaderMetaData() throws Exception {
    // Write the sample records to a temporary file the reader can be pointed at.
    val sampleRecords = recordToWrite();
    File arrowFile = tmpDataFile(sampleRecords);

    RecordReader arrowReader = new ArrowRecordReader();
    RecordMetaDataIndex metaForIndexZero =
            new RecordMetaDataIndex(0, arrowFile.toURI(), ArrowRecordReader.class);
    arrowReader.loadFromMetaData(metaForIndexZero);

    Record loaded = arrowReader.nextRecord();
    int valueCount = loaded.getRecord().size();
    assertEquals(2, valueCount);
}
 
Example 6
Source File: JacksonRecordReaderTest.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that a {@link JacksonRecordReader} configured to append labels yields the same
 * records through {@code next()} and {@code nextRecord()}, and that the records can be
 * re-loaded from their collected metadata.
 */
@Test
public void testAppendingLabelsMetaData() throws Exception {
    ClassPathResource cpr = new ClassPathResource("json/json_test_0.txt");

    // Turn ".../json_test_0.txt" into ".../json_test_%d.txt". Only the LAST "0" (the file
    // index) is substituted: String.replace would also rewrite any other zero occurring in
    // the absolute path (e.g. a version number in a parent directory) and break the pattern.
    String firstFile = cpr.getFile().getAbsolutePath();
    int indexPos = firstFile.lastIndexOf('0');
    String path = firstFile.substring(0, indexPos) + "%d" + firstFile.substring(indexPos + 1);

    InputSplit is = new NumberedFileInputSplit(path, 0, 2);

    //Insert at the end:
    RecordReader rr = new JacksonRecordReader(getFieldSelection(), new ObjectMapper(new JsonFactory()), false, -1,
                    new LabelGen());
    rr.initialize(is);

    // First pass: plain values only.
    List<List<Writable>> out = new ArrayList<>();
    while (rr.hasNext()) {
        out.add(rr.next());
    }
    assertEquals(3, out.size());

    rr.reset();

    // Second pass: same data, but keep the Record wrappers and their metadata too.
    List<List<Writable>> out2 = new ArrayList<>();
    List<Record> outRecord = new ArrayList<>();
    List<RecordMetaData> meta = new ArrayList<>();
    while (rr.hasNext()) {
        Record r = rr.nextRecord();
        out2.add(r.getRecord());
        outRecord.add(r);
        meta.add(r.getMetaData());
    }

    assertEquals(out, out2);

    // Records re-loaded from metadata must equal the ones read sequentially.
    List<Record> fromMeta = rr.loadFromMetaData(meta);
    assertEquals(outRecord, fromMeta);
}
 
Example 7
Source File: RegexRecordReaderTest.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Reads a log file with a {@link RegexLineRecordReader} twice, once via {@code next()} and
 * once via {@code nextRecord()}, and checks that both passes agree, that the reported line
 * numbers count up from 1, and that records re-loaded from metadata equal the originals.
 */
@Test
public void testRegexLineRecordReaderMeta() throws Exception {
    String regex = "(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}) (\\d+) ([A-Z]+) (.*)";

    File logFile = new ClassPathResource("/logtestdata/logtestfile0.txt").getFile();
    RecordReader rr = new RegexLineRecordReader(regex, 1);
    rr.initialize(new FileSplit(logFile));

    // Pass 1: plain values.
    List<List<Writable>> plainValues = new ArrayList<>();
    while (rr.hasNext()) {
        List<Writable> row = rr.next();
        plainValues.add(row);
    }
    assertEquals(3, plainValues.size());

    // Pass 2: full records with metadata.
    List<Record> records = new ArrayList<>();
    List<List<Writable>> recordValues = new ArrayList<>();
    List<RecordMetaData> meta = new ArrayList<>();
    rr.reset();
    int expectedLine = 1; //Start by skipping 1 line
    while (rr.hasNext()) {
        Record rec = rr.nextRecord();
        records.add(rec);
        recordValues.add(rec.getRecord());
        meta.add(rec.getMetaData());

        RecordMetaDataLine lineMeta = (RecordMetaDataLine) rec.getMetaData();
        assertEquals(expectedLine, lineMeta.getLineNumber());
        expectedLine++;
    }

    List<Record> reloaded = rr.loadFromMetaData(meta);

    assertEquals(plainValues, recordValues);
    assertEquals(records, reloaded);
}
 
Example 8
Source File: TextVectorizer.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Drains the reader, tokenizing each record and updating the vectorizer state.
 *
 * <p>For every record: its values are turned into a single string via {@code toString},
 * a tokenizer is created for that string and its tokens are processed, the callback (when
 * present) is notified, and only then is the cache's document count incremented by one.
 *
 * @param reader   record source, read until {@code hasNext()} reports no more records
 * @param callBack receives each record after its tokens were processed; may be {@code null}
 */
@Override
public void fit(RecordReader reader, RecordCallBack callBack) {
    final boolean notify = callBack != null;
    while (reader.hasNext()) {
        final Record current = reader.nextRecord();
        final String text = toString(current.getRecord());
        doWithTokens(tokenizerFactory.create(text));
        if (notify) {
            callBack.onRecord(current);
        }
        // Counted last, i.e. after the callback has seen the record.
        cache.incrementNumDocs(1);
    }
}
 
Example 9
Source File: ArrowConverterTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Primes an {@link ArrowRecordReader} with a one-element list of metadata that refers to a
 * temporary data file, then asserts that the record it returns holds two values.
 */
@Test
public void testRecordReaderMetaDataList() throws Exception {
    // Dump the records to disk first; the metadata below refers to that file by URI.
    val toWrite = recordToWrite();
    File dataFile = tmpDataFile(toWrite);

    RecordReader reader = new ArrowRecordReader();
    RecordMetaDataIndex indexMeta =
            new RecordMetaDataIndex(0, dataFile.toURI(), ArrowRecordReader.class);
    reader.loadFromMetaData(Arrays.<RecordMetaData>asList(indexMeta));

    Record first = reader.nextRecord();
    int numValues = first.getRecord().size();
    assertEquals(2, numValues);
}
 
Example 10
Source File: ArrowConverterTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Primes an {@link ArrowRecordReader} with a single metadata object that refers to a
 * temporary data file, then asserts that the record it returns holds two values.
 */
@Test
public void testRecordReaderMetaData() throws Exception {
    // Dump the records to disk first; the metadata below refers to that file by URI.
    val toWrite = recordToWrite();
    File dataFile = tmpDataFile(toWrite);

    RecordReader reader = new ArrowRecordReader();
    RecordMetaDataIndex indexMeta =
            new RecordMetaDataIndex(0, dataFile.toURI(), ArrowRecordReader.class);
    reader.loadFromMetaData(indexMeta);

    Record first = reader.nextRecord();
    int numValues = first.getRecord().size();
    assertEquals(2, numValues);
}
 
Example 11
Source File: JacksonRecordReaderTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that a {@link JacksonRecordReader} configured to append labels yields the same
 * records through {@code next()} and {@code nextRecord()}, and that the records can be
 * re-loaded from their collected metadata.
 */
@Test
public void testAppendingLabelsMetaData() throws Exception {
    // Copy the JSON fixtures into a scratch folder and address them with a numbered pattern.
    ClassPathResource jsonResources = new ClassPathResource("datavec-api/json/");
    File workDir = testDir.newFolder();
    jsonResources.copyDirectory(workDir);
    File patternFile = new File(workDir, "json_test_%d.txt");
    String pathPattern = patternFile.getAbsolutePath();

    InputSplit numberedSplit = new NumberedFileInputSplit(pathPattern, 0, 2);

    //Insert at the end:
    ObjectMapper mapper = new ObjectMapper(new JsonFactory());
    RecordReader rr = new JacksonRecordReader(getFieldSelection(), mapper, false, -1, new LabelGen());
    rr.initialize(numberedSplit);

    // First pass: plain values only.
    List<List<Writable>> plainValues = new ArrayList<>();
    while (rr.hasNext()) {
        List<Writable> row = rr.next();
        plainValues.add(row);
    }
    assertEquals(3, plainValues.size());

    rr.reset();

    // Second pass: keep the Record wrappers and their metadata as well.
    List<List<Writable>> recordValues = new ArrayList<>();
    List<Record> records = new ArrayList<>();
    List<RecordMetaData> metaData = new ArrayList<>();
    while (rr.hasNext()) {
        Record rec = rr.nextRecord();
        recordValues.add(rec.getRecord());
        records.add(rec);
        metaData.add(rec.getMetaData());
    }

    assertEquals(plainValues, recordValues);

    // Records re-loaded from metadata must equal the ones read sequentially.
    List<Record> reloaded = rr.loadFromMetaData(metaData);
    assertEquals(records, reloaded);
}
 
Example 12
Source File: RegexRecordReaderTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Reads a log file with a {@link RegexLineRecordReader} twice, once via {@code next()} and
 * once via {@code nextRecord()}, and checks that both passes agree, that the reported line
 * numbers count up from 1, and that records re-loaded from metadata equal the originals.
 */
@Test
public void testRegexLineRecordReaderMeta() throws Exception {
    String regex = "(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}) (\\d+) ([A-Z]+) (.*)";

    File logFile = new ClassPathResource("datavec-api/logtestdata/logtestfile0.txt").getFile();
    RecordReader rr = new RegexLineRecordReader(regex, 1);
    rr.initialize(new FileSplit(logFile));

    // Pass 1: plain values.
    List<List<Writable>> plainValues = new ArrayList<>();
    while (rr.hasNext()) {
        List<Writable> row = rr.next();
        plainValues.add(row);
    }
    assertEquals(3, plainValues.size());

    // Pass 2: full records with metadata.
    List<Record> records = new ArrayList<>();
    List<List<Writable>> recordValues = new ArrayList<>();
    List<RecordMetaData> meta = new ArrayList<>();
    rr.reset();
    int expectedLine = 1; //Start by skipping 1 line
    while (rr.hasNext()) {
        Record rec = rr.nextRecord();
        records.add(rec);
        recordValues.add(rec.getRecord());
        meta.add(rec.getMetaData());

        RecordMetaDataLine lineMeta = (RecordMetaDataLine) rec.getMetaData();
        assertEquals(expectedLine, lineMeta.getLineNumber());
        expectedLine++;
    }

    List<Record> reloaded = rr.loadFromMetaData(meta);

    assertEquals(plainValues, recordValues);
    assertEquals(records, reloaded);
}
 
Example 13
Source File: LineReaderTest.java    From DataVec with Apache License 2.0 4 votes vote down vote up
/**
 * Writes three 3-line files into a scratch directory, reads them with a
 * {@link LineRecordReader} via both {@code next()} and {@code nextRecord()}, and checks
 * that the two passes agree, that each record's metadata URI matches the split location
 * of the file it is expected to come from, and that records (all of them, and a subset)
 * can be re-loaded from metadata.
 */
@Test
public void testLineReaderMetaData() throws Exception {
    String tempDir = System.getProperty("java.io.tmpdir");
    File tmpdir = new File(tempDir, "tmpdir-testLineReader");

    // File.delete() cannot remove a non-empty directory, so leftovers of an aborted earlier
    // run would survive and be picked up by the FileSplit below. Wipe recursively instead.
    if (tmpdir.isDirectory()) {
        FileUtils.deleteDirectory(tmpdir);
    } else if (tmpdir.exists()) {
        tmpdir.delete();
    }
    if (!tmpdir.mkdir()) {
        throw new IllegalStateException("Could not create temporary directory: " + tmpdir);
    }

    try {
        File tmp1 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp1.txt"));
        File tmp2 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp2.txt"));
        File tmp3 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp3.txt"));

        FileUtils.writeLines(tmp1, Arrays.asList("1", "2", "3"));
        FileUtils.writeLines(tmp2, Arrays.asList("4", "5", "6"));
        FileUtils.writeLines(tmp3, Arrays.asList("7", "8", "9"));

        InputSplit split = new FileSplit(tmpdir);

        RecordReader reader = new LineRecordReader();
        reader.initialize(split);

        // First pass: plain values only.
        List<List<Writable>> list = new ArrayList<>();
        while (reader.hasNext()) {
            list.add(reader.next());
        }
        assertEquals(9, list.size());

        // Second pass: keep the Record wrappers and their metadata as well.
        List<List<Writable>> out2 = new ArrayList<>();
        List<Record> out3 = new ArrayList<>();
        List<RecordMetaData> meta = new ArrayList<>();
        reader.reset();
        int count = 0;
        while (reader.hasNext()) {
            Record r = reader.nextRecord();
            out2.add(r.getRecord());
            out3.add(r);
            meta.add(r.getMetaData());
            // Three lines per file, so every third record moves on to the next location.
            int fileIdx = count / 3;
            URI uri = r.getMetaData().getURI();
            assertEquals(split.locations()[fileIdx], uri);
            count++;
        }

        assertEquals(list, out2);

        List<Record> fromMeta = reader.loadFromMetaData(meta);
        assertEquals(out3, fromMeta);

        //try: second line of second and third files only...
        List<RecordMetaData> subsetMeta = new ArrayList<>();
        subsetMeta.add(meta.get(4));
        subsetMeta.add(meta.get(7));
        List<Record> subset = reader.loadFromMetaData(subsetMeta);
        assertEquals(2, subset.size());
        assertEquals(out3.get(4), subset.get(0));
        assertEquals(out3.get(7), subset.get(1));
    } finally {
        // Clean up even when an assertion above fails. Deliberately best effort: a cleanup
        // problem is reported but must not mask the actual test outcome.
        try {
            FileUtils.deleteDirectory(tmpdir);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
 
Example 14
Source File: LineReaderTest.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
/**
 * Writes three 3-line files into a fresh test folder, reads them with a
 * {@link LineRecordReader} via both {@code next()} and {@code nextRecord()}, and checks
 * that the two passes agree, that each record's metadata URI matches the split location
 * of the file it is expected to come from, and that records (all of them, and a subset)
 * can be re-loaded from metadata.
 */
@Test
public void testLineReaderMetaData() throws Exception {
    File workDir = testDir.newFolder();
    String workPath = workDir.getPath();

    File first = new File(FilenameUtils.concat(workPath, "tmp1.txt"));
    File second = new File(FilenameUtils.concat(workPath, "tmp2.txt"));
    File third = new File(FilenameUtils.concat(workPath, "tmp3.txt"));

    FileUtils.writeLines(first, Arrays.asList("1", "2", "3"));
    FileUtils.writeLines(second, Arrays.asList("4", "5", "6"));
    FileUtils.writeLines(third, Arrays.asList("7", "8", "9"));

    InputSplit split = new FileSplit(workDir);

    RecordReader reader = new LineRecordReader();
    reader.initialize(split);

    // First pass: plain values only.
    List<List<Writable>> plainValues = new ArrayList<>();
    while (reader.hasNext()) {
        List<Writable> line = reader.next();
        plainValues.add(line);
    }
    assertEquals(9, plainValues.size());

    // Second pass: keep the Record wrappers and their metadata as well.
    List<List<Writable>> recordValues = new ArrayList<>();
    List<Record> records = new ArrayList<>();
    List<RecordMetaData> meta = new ArrayList<>();
    reader.reset();
    for (int seen = 0; reader.hasNext(); seen++) {
        Record rec = reader.nextRecord();
        recordValues.add(rec.getRecord());
        records.add(rec);
        meta.add(rec.getMetaData());

        // Three lines per file, so every third record moves on to the next location.
        int fileIdx = seen / 3;
        URI uri = rec.getMetaData().getURI();
        assertEquals(uri, split.locations()[fileIdx]);
    }

    assertEquals(plainValues, recordValues);

    List<Record> reloaded = reader.loadFromMetaData(meta);
    assertEquals(records, reloaded);

    //try: second line of second and third files only...
    List<RecordMetaData> subsetMeta = new ArrayList<>();
    subsetMeta.add(meta.get(4));
    subsetMeta.add(meta.get(7));
    List<Record> subset = reader.loadFromMetaData(subsetMeta);
    assertEquals(2, subset.size());
    assertEquals(records.get(4), subset.get(0));
    assertEquals(records.get(7), subset.get(1));
}