Java Code Examples for org.datavec.api.records.reader.RecordReader#hasNext()

The following examples show how to use org.datavec.api.records.reader.RecordReader#hasNext(). The examples are drawn from open-source projects; the source file, originating project, and license are noted above each snippet.
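Taken together, the examples below reduce to one canonical pattern: call hasNext() to check whether a record remains, then next() (or nextRecord(), when metadata is needed) to consume it. Here is a minimal, self-contained sketch of that loop, assuming an iris.txt CSV resource on the classpath as in Example 8; note that the ClassPathResource import location varies between DataVec versions:

import org.datavec.api.records.reader.RecordReader;
import org.datavec.api.records.reader.impl.csv.CSVRecordReader;
import org.datavec.api.split.FileSplit;
import org.datavec.api.util.ClassPathResource;
import org.datavec.api.writable.Writable;

import java.util.List;

public class HasNextExample {
    public static void main(String[] args) throws Exception {
        // Skip 0 header lines, use ',' as the field delimiter
        RecordReader reader = new CSVRecordReader(0, ',');
        reader.initialize(new FileSplit(new ClassPathResource("iris.txt").getFile()));

        // hasNext() returns true while the reader has more records to return
        while (reader.hasNext()) {
            List<Writable> record = reader.next();
            System.out.println(record);
        }
        reader.close();
    }
}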
Example 1
Source File: TestConcatenatingRecordReader.java    From deeplearning4j with Apache License 2.0
@Test
public void test() throws Exception {

    CSVRecordReader rr = new CSVRecordReader(0, ',');
    rr.initialize(new FileSplit(new ClassPathResource("datavec-api/iris.dat").getFile()));

    CSVRecordReader rr2 = new CSVRecordReader(0, ',');
    rr2.initialize(new FileSplit(new ClassPathResource("datavec-api/iris.dat").getFile()));

    RecordReader rrC = new ConcatenatingRecordReader(rr, rr2);

    int count = 0;
    while(rrC.hasNext()){
        rrC.next();
        count++;
    }

    assertEquals(300, count);
}
 
Example 2
Source File: VasttextTextVectorizer.java    From scava with Eclipse Public License 2.0
@Override
public void fit(RecordReader reader, RecordCallBack callBack) {
    while (reader.hasNext()) {
        Record record = reader.nextRecord();
        String s = record.getRecord().get(0).toString();
        Tokenizer tokenizer = tokenizerFactory.create(s);
        cache.incrementNumDocs(1);
        // These tokens might differ from the tokenizer's output if stop words are used
        List<String> tokens;
        if (stopWords == null)
            tokens = doWithTokens(tokenizer);
        else
            tokens = doWithTokensStopWords(tokenizer);
        if (maxNgrams > 1)
            doWithNgram(ngramsGenerator(tokens));
        if (callBack != null)
            callBack.onRecord(record);
    }
}
 
Example 3
Source File: TestSerialization.java    From DataVec with Apache License 2.0
@Test
public void testCsvRRSerializationResults() throws Exception {
    int skipLines = 3;
    RecordReader r1 = new CSVRecordReader(skipLines, '\t');
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    ObjectOutputStream os = new ObjectOutputStream(baos);
    os.writeObject(r1);
    byte[] bytes = baos.toByteArray();
    ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes));
    RecordReader r2 = (RecordReader) ois.readObject();

    File f = new ClassPathResource("iris_tab_delim.txt").getFile();

    r1.initialize(new FileSplit(f));
    r2.initialize(new FileSplit(f));

    int count = 0;
    while(r1.hasNext()){
        List<Writable> n1 = r1.next();
        List<Writable> n2 = r2.next();
        assertEquals(n1, n2);
        count++;
    }

    assertEquals(150-skipLines, count);
}
 
Example 4
Source File: TestConcatenatingRecordReader.java    From DataVec with Apache License 2.0
@Test
public void test() throws Exception {

    CSVRecordReader rr = new CSVRecordReader(0, ',');
    rr.initialize(new FileSplit(new ClassPathResource("iris.dat").getFile()));

    CSVRecordReader rr2 = new CSVRecordReader(0, ',');
    rr2.initialize(new FileSplit(new ClassPathResource("iris.dat").getFile()));

    RecordReader rrC = new ConcatenatingRecordReader(rr, rr2);

    int count = 0;
    while(rrC.hasNext()){
        rrC.next();
        count++;
    }

    assertEquals(300, count);
}
 
Example 5
Source File: LineReaderTest.java    From DataVec with Apache License 2.0
@Test
public void testLineReader() throws Exception {
    String tempDir = System.getProperty("java.io.tmpdir");
    File tmpdir = new File(tempDir, "tmpdir-testLineReader");
    if (tmpdir.exists())
        FileUtils.deleteDirectory(tmpdir); // File.delete() fails silently on a non-empty directory
    tmpdir.mkdir();

    File tmp1 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp1.txt"));
    File tmp2 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp2.txt"));
    File tmp3 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp3.txt"));

    FileUtils.writeLines(tmp1, Arrays.asList("1", "2", "3"));
    FileUtils.writeLines(tmp2, Arrays.asList("4", "5", "6"));
    FileUtils.writeLines(tmp3, Arrays.asList("7", "8", "9"));

    InputSplit split = new FileSplit(tmpdir);

    RecordReader reader = new LineRecordReader();
    reader.initialize(split);

    int count = 0;
    List<List<Writable>> list = new ArrayList<>();
    while (reader.hasNext()) {
        List<Writable> l = reader.next();
        assertEquals(1, l.size());
        list.add(l);
        count++;
    }

    assertEquals(9, count);

    try {
        FileUtils.deleteDirectory(tmpdir);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example 6
Source File: AnalyzeLocal.java    From deeplearning4j with Apache License 2.0
/**
 * Analyse the specified data, returning a DataAnalysis object with summary information about each column
 *
 * @param schema              Schema for the data
 * @param rr                  Data to analyze
 * @param maxHistogramBuckets Maximum number of histogram buckets per column
 * @return DataAnalysis for the data
 */
public static DataAnalysis analyze(Schema schema, RecordReader rr, int maxHistogramBuckets){
    AnalysisAddFunction addFn = new AnalysisAddFunction(schema);
    List<AnalysisCounter> counters = null;
    while(rr.hasNext()){
        counters = addFn.apply(counters, rr.next());
    }

    double[][] minsMaxes = new double[counters.size()][2];

    List<ColumnType> columnTypes = schema.getColumnTypes();
    List<ColumnAnalysis> list = DataVecAnalysisUtils.convertCounters(counters, minsMaxes, columnTypes);


    //Do another pass collecting histogram values:
    List<HistogramCounter> histogramCounters = null;
    HistogramAddFunction add = new HistogramAddFunction(maxHistogramBuckets, schema, minsMaxes);
    if(rr.resetSupported()){
        rr.reset();
        while(rr.hasNext()){
            histogramCounters = add.apply(histogramCounters, rr.next());
        }

        DataVecAnalysisUtils.mergeCounters(list, histogramCounters);
    }

    return new DataAnalysis(schema, list);
}
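Note that the second, histogram-collecting pass runs only when the reader supports reset(); for a non-resettable reader the returned DataAnalysis simply omits the histogram information.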
 
Example 7
Source File: JacksonLineRecordReaderTest.java    From DataVec with Apache License 2.0
private static void testJacksonRecordReader(RecordReader rr) {
    while (rr.hasNext()) {
        List<Writable> json0 = rr.next();
        // Use a JUnit assertion rather than the assert keyword, which is a no-op unless -ea is set
        assertTrue(json0.size() > 0);
    }
}
 
Example 8
Source File: TestAnalyzeLocal.java    From deeplearning4j with Apache License 2.0
@Test
public void testAnalysisBasic() throws Exception {

    RecordReader rr = new CSVRecordReader();
    rr.initialize(new FileSplit(new ClassPathResource("iris.txt").getFile()));

    Schema s = new Schema.Builder()
            .addColumnsDouble("0", "1", "2", "3")
            .addColumnInteger("label")
            .build();

    DataAnalysis da = AnalyzeLocal.analyze(s, rr);

    System.out.println(da);

    //Compare:
    List<List<Writable>> list = new ArrayList<>();
    rr.reset();
    while(rr.hasNext()){
        list.add(rr.next());
    }

    INDArray arr = RecordConverter.toMatrix(DataType.DOUBLE, list);
    INDArray mean = arr.mean(0);
    INDArray std = arr.std(0);

    for( int i=0; i<5; i++ ){
        double m = ((NumericalColumnAnalysis)da.getColumnAnalysis().get(i)).getMean();
        double stddev = ((NumericalColumnAnalysis)da.getColumnAnalysis().get(i)).getSampleStdev();
        assertEquals(mean.getDouble(i), m, 1e-3);
        assertEquals(std.getDouble(i), stddev, 1e-3);
    }

}
 
Example 9
Source File: JacksonRecordReaderTest.java    From DataVec with Apache License 2.0
@Test
public void testAppendingLabelsMetaData() throws Exception {
    ClassPathResource cpr = new ClassPathResource("json/json_test_0.txt");
    String path = cpr.getFile().getAbsolutePath().replace("0", "%d");

    InputSplit is = new NumberedFileInputSplit(path, 0, 2);

    //Insert at the end:
    RecordReader rr = new JacksonRecordReader(getFieldSelection(), new ObjectMapper(new JsonFactory()), false, -1,
                    new LabelGen());
    rr.initialize(is);

    List<List<Writable>> out = new ArrayList<>();
    while (rr.hasNext()) {
        out.add(rr.next());
    }
    assertEquals(3, out.size());

    rr.reset();

    List<List<Writable>> out2 = new ArrayList<>();
    List<Record> outRecord = new ArrayList<>();
    List<RecordMetaData> meta = new ArrayList<>();
    while (rr.hasNext()) {
        Record r = rr.nextRecord();
        out2.add(r.getRecord());
        outRecord.add(r);
        meta.add(r.getMetaData());
    }

    assertEquals(out, out2);

    List<Record> fromMeta = rr.loadFromMetaData(meta);
    assertEquals(outRecord, fromMeta);
}
 
Example 10
Source File: RegexRecordReaderTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testRegexLineRecordReaderMeta() throws Exception {
    String regex = "(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}) (\\d+) ([A-Z]+) (.*)";

    RecordReader rr = new RegexLineRecordReader(regex, 1);
    rr.initialize(new FileSplit(new ClassPathResource("datavec-api/logtestdata/logtestfile0.txt").getFile()));

    List<List<Writable>> list = new ArrayList<>();
    while (rr.hasNext()) {
        list.add(rr.next());
    }
    assertEquals(3, list.size());

    List<Record> list2 = new ArrayList<>();
    List<List<Writable>> list3 = new ArrayList<>();
    List<RecordMetaData> meta = new ArrayList<>();
    rr.reset();
    int count = 1; //Start by skipping 1 line
    while (rr.hasNext()) {
        Record r = rr.nextRecord();
        list2.add(r);
        list3.add(r.getRecord());
        meta.add(r.getMetaData());

        assertEquals(count++, ((RecordMetaDataLine) r.getMetaData()).getLineNumber());
    }

    List<Record> fromMeta = rr.loadFromMetaData(meta);

    assertEquals(list, list3);
    assertEquals(list2, fromMeta);
}
 
Example 11
Source File: RecordReaderConverter.java    From DataVec with Apache License 2.0
/**
 * Write all values from the specified record reader to the specified record writer.
 * Optionally, close the record writer on completion
 *
 * @param reader Record reader (source of data)
 * @param writer Record writer (location to write data)
 * @param closeOnCompletion if true: close the record writer once complete, via {@link RecordWriter#close()}
 * @throws IOException If underlying reader/writer throws an exception
 */
public static void convert(RecordReader reader, RecordWriter writer, boolean closeOnCompletion) throws IOException {

    if(!reader.hasNext()){
        throw new UnsupportedOperationException("Cannot convert RecordReader: reader has no next element");
    }

    while(reader.hasNext()){
        writer.write(reader.next());
    }

    if(closeOnCompletion){
        writer.close();
    }
}
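A minimal usage sketch for convert. The CSV source mirrors the other examples on this page; the writer is taken as a parameter because RecordWriter construction differs between implementations and DataVec versions:

// Copy every record from a CSV file through an already-initialized RecordWriter
public static void copyCsv(File input, RecordWriter writer) throws Exception {
    RecordReader reader = new CSVRecordReader(0, ',');
    reader.initialize(new FileSplit(input));
    RecordReaderConverter.convert(reader, writer, true); // true: close the writer when done
}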
 
Example 12
Source File: TransformProcess.java    From DataVec with Apache License 2.0
/**
 * Infer the categories for the given record reader for a particular set of columns
 * (this is more efficient than {@link #inferCategories(RecordReader, int)}
 * if you plan on inferring categories for more than one column).
 *
 * Note that each "column index" is a column in the context of:
 * List<Writable> record = ...;
 * record.get(columnIndex);
 *
 * Note that anything passed in as a column will be automatically converted to a
 * string for categorical purposes. Results may vary depending on what's passed in.
 * The *expected* input is strings or numbers (which have sensible toString() representations).
 *
 * Note that the returned categories will be sorted alphabetically, for each column.
 *
 * @param recordReader  the record reader to scan
 * @param columnIndices the column indices to get categories for
 * @return the inferred categories
 */
public static Map<Integer,List<String>> inferCategories(RecordReader recordReader,int[] columnIndices) {
    if(columnIndices == null || columnIndices.length < 1) {
        return Collections.emptyMap();
    }

    Map<Integer,List<String>> categoryMap = new HashMap<>();
    Map<Integer,Set<String>> categories = new HashMap<>();
    for(int i = 0; i < columnIndices.length; i++) {
        categoryMap.put(columnIndices[i],new ArrayList<String>());
        categories.put(columnIndices[i],new HashSet<String>());
    }
    while(recordReader.hasNext()) {
        List<Writable> next = recordReader.next();
        for(int i = 0; i < columnIndices.length; i++) {
            if(columnIndices[i] >= next.size()) {
                log.warn("Filtering out example: Invalid length of columns");
                continue;
            }

            categories.get(columnIndices[i]).add(next.get(columnIndices[i]).toString());
        }

    }

    for(int i = 0; i < columnIndices.length; i++) {
        categoryMap.get(columnIndices[i]).addAll(categories.get(columnIndices[i]));

        //Sort categories alphabetically - HashSet and RecordReader orders are not deterministic in general
        Collections.sort(categoryMap.get(columnIndices[i]));
    }

    return categoryMap;
}
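A hypothetical usage sketch (the file name and column indices here are illustrative only):

RecordReader rr = new CSVRecordReader(0, ',');
rr.initialize(new FileSplit(new File("data.csv")));

// One pass over the data infers the category sets for columns 0 and 2 together
Map<Integer, List<String>> categories = TransformProcess.inferCategories(rr, new int[]{0, 2});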
 
Example 13
Source File: JacksonRecordReaderTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testAppendingLabelsMetaData() throws Exception {
    ClassPathResource cpr = new ClassPathResource("datavec-api/json/");
    File f = testDir.newFolder();
    cpr.copyDirectory(f);
    String path = new File(f, "json_test_%d.txt").getAbsolutePath();

    InputSplit is = new NumberedFileInputSplit(path, 0, 2);

    //Insert at the end:
    RecordReader rr = new JacksonRecordReader(getFieldSelection(), new ObjectMapper(new JsonFactory()), false, -1,
                    new LabelGen());
    rr.initialize(is);

    List<List<Writable>> out = new ArrayList<>();
    while (rr.hasNext()) {
        out.add(rr.next());
    }
    assertEquals(3, out.size());

    rr.reset();

    List<List<Writable>> out2 = new ArrayList<>();
    List<Record> outRecord = new ArrayList<>();
    List<RecordMetaData> meta = new ArrayList<>();
    while (rr.hasNext()) {
        Record r = rr.nextRecord();
        out2.add(r.getRecord());
        outRecord.add(r);
        meta.add(r.getMetaData());
    }

    assertEquals(out, out2);

    List<Record> fromMeta = rr.loadFromMetaData(meta);
    assertEquals(outRecord, fromMeta);
}
 
Example 14
Source File: ComposableRecordReader.java    From deeplearning4j with Apache License 2.0
@Override
public boolean hasNext() {
    boolean readersHasNext = true;
    for (RecordReader reader : readers) {
        readersHasNext = readersHasNext && reader.hasNext();
    }
    return readersHasNext;
}
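In other words, a ComposableRecordReader reports a next element only while every underlying reader still has one, so iteration stops as soon as the shortest reader is exhausted.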
 
Example 15
Source File: AnalyzeLocal.java    From deeplearning4j with Apache License 2.0
/**
 * Get a list of unique values from the specified columns.
 * For sequence data, use {@link #getUniqueSequence(List, Schema, SequenceRecordReader)}
 *
 * @param columnName    Name of the column to get unique values from
 * @param schema        Data schema
 * @param data          Data to get unique values from
 * @return              List of unique values
 */
public static Set<Writable> getUnique(String columnName, Schema schema, RecordReader data) {
    int colIdx = schema.getIndexOfColumn(columnName);
    Set<Writable> unique = new HashSet<>();
    while(data.hasNext()){
        List<Writable> next = data.next();
        unique.add(next.get(colIdx));
    }
    return unique;
}
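Note that this fully consumes the reader; if the same reader is needed again afterwards (and it supports resetting), call reset() before reusing it.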
 
Example 16
Source File: LineReaderTest.java    From DataVec with Apache License 2.0
@Test
public void testLineReaderWithInputStreamInputSplit() throws Exception {
    String tempDir = System.getProperty("java.io.tmpdir");
    File tmpdir = new File(tempDir, "tmpdir");
    tmpdir.mkdir();

    File tmp1 = new File(tmpdir, "tmp1.txt.gz");

    OutputStream os = new GZIPOutputStream(new FileOutputStream(tmp1, false));
    IOUtils.writeLines(Arrays.asList("1", "2", "3", "4", "5", "6", "7", "8", "9"), null, os);
    os.flush();
    os.close();

    InputSplit split = new InputStreamInputSplit(new GZIPInputStream(new FileInputStream(tmp1)));

    RecordReader reader = new LineRecordReader();
    reader.initialize(split);

    int count = 0;
    while (reader.hasNext()) {
        assertEquals(1, reader.next().size());
        count++;
    }

    assertEquals(9, count);

    try {
        FileUtils.deleteDirectory(tmpdir);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example 17
Source File: VasttextDataIterator.java    From scava with Eclipse Public License 2.0
@Override
public MultiDataSet next(int num) {
    if (!hasNext())
        throw new NoSuchElementException("No next elements");

    // First: load the next values from the RR / SeqRRs
    Map<String, List<List<Writable>>> nextRRVals = new HashMap<>();
    List<RecordMetaDataComposableMap> nextMetas =
            (collectMetaData ? new ArrayList<RecordMetaDataComposableMap>() : null);

    for (Map.Entry<String, RecordReader> entry : recordReaders.entrySet()) {
        RecordReader rr = entry.getValue();
        // Standard case. Min op: in case the user sets batch size >> amount of data
        List<List<Writable>> writables = new ArrayList<>(Math.min(num, 100000));
        for (int i = 0; i < num && rr.hasNext(); i++) {
            List<Writable> record;
            if (collectMetaData) {
                Record r = rr.nextRecord();
                record = r.getRecord();
                if (nextMetas.size() <= i) {
                    nextMetas.add(new RecordMetaDataComposableMap(new HashMap<String, RecordMetaData>()));
                }
                RecordMetaDataComposableMap map = nextMetas.get(i);
                map.getMeta().put(entry.getKey(), r.getMetaData());
            } else {
                record = rr.next();
            }
            writables.add(record);
        }

        nextRRVals.put(entry.getKey(), writables);
    }

    return nextMultiDataSet(nextRRVals, nextMetas);
}
 
Example 18
Source File: LocalTransformProcessRecordReaderTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testLocalFilter(){

    List<List<Writable>> in = new ArrayList<>();
    in.add(Arrays.asList(new Text("Keep"), new IntWritable(0)));
    in.add(Arrays.asList(new Text("Remove"), new IntWritable(1)));
    in.add(Arrays.asList(new Text("Keep"), new IntWritable(2)));
    in.add(Arrays.asList(new Text("Remove"), new IntWritable(3)));

    Schema s = new Schema.Builder()
            .addColumnCategorical("cat", "Keep", "Remove")
            .addColumnInteger("int")
            .build();

    TransformProcess tp = new TransformProcess.Builder(s)
            .filter(new CategoricalColumnCondition("cat", ConditionOp.Equal, "Remove"))
            .build();

    RecordReader rr = new CollectionRecordReader(in);
    LocalTransformProcessRecordReader ltprr = new LocalTransformProcessRecordReader(rr, tp);

    List<List<Writable>> out = new ArrayList<>();
    while(ltprr.hasNext()){
        out.add(ltprr.next());
    }

    List<List<Writable>> exp = Arrays.asList(in.get(0), in.get(2));

    assertEquals(exp, out);

    //Check reset:
    ltprr.reset();
    out.clear();
    while(ltprr.hasNext()){
        out.add(ltprr.next());
    }
    assertEquals(exp, out);


    //Also test Record method:
    List<Record> rl = new ArrayList<>();
    rr.reset();
    while(rr.hasNext()){
        rl.add(rr.nextRecord());
    }
    List<Record> exp2 = Arrays.asList(rl.get(0), rl.get(2));

    List<Record> act = new ArrayList<>();
    ltprr.reset();
    while(ltprr.hasNext()){
        act.add(ltprr.nextRecord());
    }
    assertEquals(exp2, act); //The original test collected 'act' without asserting it
}
 
Example 19
Source File: LineReaderTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testLineReaderMetaData() throws Exception {
    File tmpdir = testDir.newFolder();

    File tmp1 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp1.txt"));
    File tmp2 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp2.txt"));
    File tmp3 = new File(FilenameUtils.concat(tmpdir.getPath(), "tmp3.txt"));

    FileUtils.writeLines(tmp1, Arrays.asList("1", "2", "3"));
    FileUtils.writeLines(tmp2, Arrays.asList("4", "5", "6"));
    FileUtils.writeLines(tmp3, Arrays.asList("7", "8", "9"));

    InputSplit split = new FileSplit(tmpdir);

    RecordReader reader = new LineRecordReader();
    reader.initialize(split);

    List<List<Writable>> list = new ArrayList<>();
    while (reader.hasNext()) {
        list.add(reader.next());
    }
    assertEquals(9, list.size());


    List<List<Writable>> out2 = new ArrayList<>();
    List<Record> out3 = new ArrayList<>();
    List<RecordMetaData> meta = new ArrayList<>();
    reader.reset();
    int count = 0;
    while (reader.hasNext()) {
        Record r = reader.nextRecord();
        out2.add(r.getRecord());
        out3.add(r);
        meta.add(r.getMetaData());
        int fileIdx = count / 3;
        URI uri = r.getMetaData().getURI();
        assertEquals(uri, split.locations()[fileIdx]);
        count++;
    }

    assertEquals(list, out2);

    List<Record> fromMeta = reader.loadFromMetaData(meta);
    assertEquals(out3, fromMeta);

    //try: second line of second and third files only...
    List<RecordMetaData> subsetMeta = new ArrayList<>();
    subsetMeta.add(meta.get(4));
    subsetMeta.add(meta.get(7));
    List<Record> subset = reader.loadFromMetaData(subsetMeta);
    assertEquals(2, subset.size());
    assertEquals(out3.get(4), subset.get(0));
    assertEquals(out3.get(7), subset.get(1));
}
 
Example 20
Source File: TransformProcess.java    From deeplearning4j with Apache License 2.0
/**
 * Infer the categories for the given record reader for a particular column.
 *
 * Note that each "column index" is a column in the context of:
 * List<Writable> record = ...;
 * record.get(columnIndex);
 *
 * Note that anything passed in as a column will be automatically converted to a
 * string for categorical purposes.
 *
 * The *expected* input is strings or numbers (which have sensible toString() representations).
 *
 * Note that the returned categories will be sorted alphabetically.
 *
 * @param recordReader the record reader to iterate through
 * @param columnIndex  the column index to get categories for
 * @return the inferred categories, sorted alphabetically
 */
public static List<String> inferCategories(RecordReader recordReader,int columnIndex) {
    Set<String> categories = new HashSet<>();
    while(recordReader.hasNext()) {
        List<Writable> next = recordReader.next();
        categories.add(next.get(columnIndex).toString());
    }

    //Sort categories alphabetically - HashSet and RecordReader orders are not deterministic in general
    List<String> ret = new ArrayList<>(categories);
    Collections.sort(ret);
    return ret;
}