org.deeplearning4j.text.documentiterator.FileLabelAwareIterator Java Examples

The following examples show how to use org.deeplearning4j.text.documentiterator.FileLabelAwareIterator. All of them are taken from the deeplearning4j project (Apache License 2.0); the source file is noted above each example.
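All of the examples share the same basic pattern: point the builder at a root folder whose subfolders name the labels, then walk the resulting LabelledDocuments. Here is a minimal, self-contained sketch of that pattern. The folder path is a placeholder, and getContent() is an assumption about the LabelledDocument API; the examples below only call getLabels() and getReferencedContent().

import org.deeplearning4j.text.documentiterator.FileLabelAwareIterator;
import org.deeplearning4j.text.documentiterator.LabelledDocument;

import java.io.File;

public class FileLabelAwareIteratorSketch {
    public static void main(String[] args) {
        // Placeholder path: a root folder whose subfolders ("positive", "negative", ...)
        // become the labels of the documents they contain
        File sourceFolder = new File("/path/to/labeled");

        FileLabelAwareIterator iterator = new FileLabelAwareIterator.Builder()
                .addSourceFolder(sourceFolder)
                .build();

        while (iterator.hasNextDocument()) {
            LabelledDocument document = iterator.nextDocument();
            // getContent() is assumed to return the raw file text (see note above)
            System.out.println(document.getLabels() + " -> " + document.getContent());
        }

        // The iterator is reusable: reset() rewinds it to the first document
        iterator.reset();
    }
}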
Example #1
Source File: Word2VecTestsSmall.java    From deeplearning4j with Apache License 2.0
@Test
public void testLabelAwareIterator_1() throws Exception {
    val resource = new ClassPathResource("/labeled");
    val file = resource.getFile();

    val iter = (LabelAwareIterator) new FileLabelAwareIterator.Builder().addSourceFolder(file).build();

    val t = new DefaultTokenizerFactory();

    val w2v = new Word2Vec.Builder()
            .iterate(iter)
            .tokenizerFactory(t)
            .build();

    // we expect nothing to happen here: the model is only built, never fitted
}
 
Example #2
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
/**
 * This test is not indicative.
 * There is no need to run it on Travis; use it manually, and only for problem detection.
 *
 * @throws Exception
 */
@Test
@Ignore
public void testParagraphVectorsReducedLabels1() throws Exception {
    val tempDir = testDir.newFolder();
    ClassPathResource resource = new ClassPathResource("/labeled");
    resource.copyDirectory(tempDir);

    LabelAwareIterator iter = new FileLabelAwareIterator.Builder().addSourceFolder(tempDir).build();

    TokenizerFactory t = new DefaultTokenizerFactory();

    /*
     * Please note: the text corpus is REALLY small, so some kind of "results" can only be
     * obtained with a HIGH number of epochs, like 30. But there's no reason to keep it
     * that high here.
     */

    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).epochs(3).layerSize(100)
                    .stopWords(new ArrayList<String>()).windowSize(5).iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    //WordVectorSerializer.writeWordVectors(vec, "vectors.txt");

    INDArray w1 = vec.lookupTable().vector("I");
    INDArray w2 = vec.lookupTable().vector("am");
    INDArray w3 = vec.lookupTable().vector("sad.");

    INDArray words = Nd4j.create(3, vec.lookupTable().layerSize());

    words.putRow(0, w1);
    words.putRow(1, w2);
    words.putRow(2, w3);


    INDArray mean = words.isMatrix() ? words.mean(0) : words;

    log.info("Mean: " + Arrays.toString(mean.dup().data().asDouble()));
    log.info("Array: " + Arrays.toString(vec.lookupTable().vector("negative").dup().data().asDouble()));

    double simN = Transforms.cosineSim(mean, vec.lookupTable().vector("negative"));
    log.info("Similarity negative: " + simN);


    double simP = Transforms.cosineSim(mean, vec.lookupTable().vector("neutral"));
    log.info("Similarity neutral: " + simP);

    double simV = Transforms.cosineSim(mean, vec.lookupTable().vector("positive"));
    log.info("Similarity positive: " + simV);
}
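The core move in the test above, reduced to a reusable helper: stack a few word vectors into a matrix, average them, and score the mean against a label's vector with cosine similarity. This is a hedged sketch, not DL4J API; labelSimilarity is a hypothetical helper, and it assumes 'vec' is a fitted ParagraphVectors model whose vocabulary contains the given words and label.

import org.deeplearning4j.models.paragraphvectors.ParagraphVectors;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;

public class LabelSimilaritySketch {
    // Hypothetical helper: average the vectors of a few words and compare the mean
    // against the vector of a label known to the model
    static double labelSimilarity(ParagraphVectors vec, String label, String... words) {
        INDArray[] rows = new INDArray[words.length];
        for (int i = 0; i < words.length; i++)
            rows[i] = vec.lookupTable().vector(words[i]);

        // Stack the word vectors row-wise, then average them into one document-like vector
        INDArray mean = Nd4j.vstack(rows).mean(0);

        // Higher cosine similarity means the averaged text sits closer to that label
        return Transforms.cosineSim(mean, vec.lookupTable().vector(label));
    }
}

With a fitted model, labelSimilarity(vec, "negative", "I", "am", "sad.") would reproduce simN from the test.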
 
Example #3
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testIterator() throws IOException {
    val folder_labeled = testDir.newFolder();
    val folder_unlabeled = testDir.newFolder();
    new ClassPathResource("/paravec/labeled/").copyDirectory(folder_labeled);
    new ClassPathResource("/paravec/unlabeled/").copyDirectory(folder_unlabeled);


    FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder()
            .addSourceFolder(folder_labeled).build();

    File resource_sentences = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(resource_sentences);

    for (int i = 0; i < 10; i++) {
        int j = 0;
        int labels = 0;
        int words = 0;
        while (labelAwareIterator.hasNextDocument()) {
            ++j;
            LabelledDocument document = labelAwareIterator.nextDocument();
            labels += document.getLabels().size();
            // getReferencedContent() is not populated by FileLabelAwareIterator,
            // which is why the assertion below expects 'words' to stay 0
            List<VocabWord> lst = document.getReferencedContent();
            if (!CollectionUtils.isEmpty(lst))
                words += lst.size();
        }
        labelAwareIterator.reset();
        //System.out.println(words + " " + labels + " " + j);
        assertEquals(0, words);
        assertEquals(30, labels);
        assertEquals(30, j);
        j = 0;
        while (iter.hasNext()) {
            ++j;
            iter.nextSentence();
        }
        assertEquals(97162, j);
        iter.reset();
    }

}
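Note the words == 0 assertion above: FileLabelAwareIterator by itself does not attach VocabWord references to the documents it yields. To get token sequences out of the labeled corpus, Examples #4 and #5 below route the iterator through a SentenceTransformer. A minimal sketch of that wiring, with a placeholder path:

import org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator;
import org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer;
import org.deeplearning4j.models.word2vec.VocabWord;
import org.deeplearning4j.text.documentiterator.FileLabelAwareIterator;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

import java.io.File;

public class LabeledSequenceSketch {
    public static void main(String[] args) {
        // Placeholder path to a labeled corpus (one subfolder per label)
        File dir = new File("/path/to/paravec/labeled");

        TokenizerFactory t = new DefaultTokenizerFactory();

        FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder()
                .addSourceFolder(dir).build();

        // Tokenizes each LabelledDocument and emits it as a sequence of VocabWords
        SentenceTransformer transformer = new SentenceTransformer.Builder()
                .iterator(labelAwareIterator).tokenizerFactory(t).build();

        // 'sequenceIterator' can then feed a VocabConstructor, as in Examples #4 and #5
        AbstractSequenceIterator<VocabWord> sequenceIterator =
                new AbstractSequenceIterator.Builder<>(transformer).build();
    }
}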
 
Example #4
Source File: InMemoryLookupTableTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testConsumeOnNonEqualVocabs() throws Exception {
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    AbstractCache<VocabWord> cacheSource = new AbstractCache.Builder<VocabWord>().build();


    // Build the source vocabulary from the raw, unlabeled sentence corpus
    File resource = Resources.asFile("big/raw_sentences.txt");

    BasicLineIterator underlyingIterator = new BasicLineIterator(resource);


    SentenceTransformer transformer =
                    new SentenceTransformer.Builder().iterator(underlyingIterator).tokenizerFactory(t).build();

    AbstractSequenceIterator<VocabWord> sequenceIterator =
                    new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> vocabConstructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequenceIterator, 1).setTargetVocabCache(cacheSource).build();

    vocabConstructor.buildJointVocabulary(false, true);

    assertEquals(244, cacheSource.numWords());

    InMemoryLookupTable<VocabWord> mem1 =
                    (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(100)
                                    .cache(cacheSource).build();

    mem1.resetWeights(true);



    AbstractCache<VocabWord> cacheTarget = new AbstractCache.Builder<VocabWord>().build();


    // Now build a target vocabulary from the labeled corpus and merge the source vocab into it
    val dir = testDir.newFolder();
    new ClassPathResource("/paravec/labeled/").copyDirectory(dir);

    FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder()
                    .addSourceFolder(dir).build();

    transformer = new SentenceTransformer.Builder().iterator(labelAwareIterator).tokenizerFactory(t).build();

    sequenceIterator = new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> vocabTransfer = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequenceIterator, 1).setTargetVocabCache(cacheTarget).build();

    vocabTransfer.buildMergedVocabulary(cacheSource, true);

    // the +3 accounts for the 3 additional entries in the target VocabCache: the labels
    assertEquals(cacheSource.numWords() + 3, cacheTarget.numWords());


    InMemoryLookupTable<VocabWord> mem2 =
                    (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(100)
                                    .cache(cacheTarget).seed(18).build();

    mem2.resetWeights(true);

    assertNotEquals(mem1.vector("day"), mem2.vector("day"));

    // consume() transfers the weights of all shared vocabulary entries from mem1 into mem2
    mem2.consume(mem1);

    assertEquals(mem1.vector("day"), mem2.vector("day"));

    assertTrue(mem1.syn0.rows() < mem2.syn0.rows());

    assertEquals(mem1.syn0.rows() + 3, mem2.syn0.rows());
}
 
Example #5
Source File: VocabConstructorTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testMergedVocabWithLabels1() throws Exception {
    // 't' is a TokenizerFactory field in the original test class; declared locally here
    // (mirroring Example #4) so the snippet is self-contained
    TokenizerFactory t = new DefaultTokenizerFactory();

    AbstractCache<VocabWord> cacheSource = new AbstractCache.Builder<VocabWord>().build();

    AbstractCache<VocabWord> cacheTarget = new AbstractCache.Builder<VocabWord>().build();

    File resource = Resources.asFile("big/raw_sentences.txt");

    BasicLineIterator underlyingIterator = new BasicLineIterator(resource);


    SentenceTransformer transformer =
                    new SentenceTransformer.Builder().iterator(underlyingIterator).tokenizerFactory(t).build();

    AbstractSequenceIterator<VocabWord> sequenceIterator =
                    new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> vocabConstructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequenceIterator, 1).setTargetVocabCache(cacheSource).build();

    vocabConstructor.buildJointVocabulary(false, true);

    int sourceSize = cacheSource.numWords();
    log.info("Source Vocab size: " + sourceSize);

    val dir = testDir.newFolder();
    new ClassPathResource("/paravec/labeled/").copyDirectory(dir);


    FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder()
                    .addSourceFolder(dir).build();

    transformer = new SentenceTransformer.Builder().iterator(labelAwareIterator).tokenizerFactory(t).build();

    sequenceIterator = new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> vocabTransfer = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequenceIterator, 1).setTargetVocabCache(cacheTarget).build();

    vocabTransfer.buildMergedVocabulary(cacheSource, true);

    // the +3 accounts for the 3 additional entries in the target VocabCache: the labels
    assertEquals(sourceSize + 3, cacheTarget.numWords());

    // now we check index equality for transferred elements
    assertEquals(cacheSource.wordAtIndex(17), cacheTarget.wordAtIndex(17));
    assertEquals(cacheSource.wordAtIndex(45), cacheTarget.wordAtIndex(45));
    assertEquals(cacheSource.wordAtIndex(89), cacheTarget.wordAtIndex(89));

    // we check that newly added labels have indexes beyond the original VocabCache index space;
    // indexes are zero-based, so "beyond" means index >= sourceSize, i.e. > sourceSize - 1
    assertTrue(cacheTarget.indexOf("Zfinance") > sourceSize - 1);
    assertTrue(cacheTarget.indexOf("Zscience") > sourceSize - 1);
    assertTrue(cacheTarget.indexOf("Zhealth") > sourceSize - 1);
}