org.deeplearning4j.text.documentiterator.LabelledDocument Java Examples

The following examples show how to use org.deeplearning4j.text.documentiterator.LabelledDocument. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BasicTransformerIterator.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Converts the next document from the backing iterator into a {@code Sequence<VocabWord>}.
 * <p>
 * An exhausted iterator (null document) or a document without content yields an
 * empty sequence rather than null, so callers need no null check. Every non-empty
 * label on the document is attached to the sequence as a sequence label.
 *
 * @return the transformed sequence; never null, possibly empty
 */
@Override
public Sequence<VocabWord> next() {
    LabelledDocument document = iterator.nextDocument();
    if (document == null || document.getContent() == null)
        return new Sequence<>();

    Sequence<VocabWord> sequence = sentenceTransformer.transformToSequence(document.getContent());

    // Multi-label aware: each non-blank label becomes its own sequence label.
    if (document.getLabels() != null)
        for (String label : document.getLabels()) {
            if (label != null && !label.isEmpty())
                sequence.addSequenceLabel(new VocabWord(1.0, label));
        }

    return sequence;
}
 
Example #2
Source File: LabelAwareConverter.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Adapts the backing label-aware iterator to a sentence-pair view: returns the
 * next document's content paired with its first label.
 * <p>
 * NOTE(review): only the first label is propagated; additional labels are dropped.
 * Presumably every document carries at least one label — if {@code getLabels()}
 * can be null or empty here, this will throw; confirm against the backing iterator.
 *
 * @return pair of (document content, first document label)
 */
@Override
public Pair<String, String> nextSentence() {
    LabelledDocument document = backingIterator.nextDocument();

    // TODO: probably worth to allow more than one label? i.e. pass same document twice, sequentially
    return Pair.makePair(document.getContent(), document.getLabels().get(0));
}
 
Example #3
Source File: TfidfVectorizerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies TF-IDF vectorization driven by a LabelAwareIterator: two single-label
 * documents are fitted, a DataSet can be produced for known content, and the
 * LabelsSource reports both labels in insertion order.
 */
public void testTfIdfVectorizerFromLabelAwareIterator() throws Exception {
    LabelledDocument dogDocument = new LabelledDocument();
    dogDocument.addLabel("dog");
    dogDocument.setContent("it barks like a dog");

    LabelledDocument catDocument = new LabelledDocument();
    catDocument.addLabel("cat");
    catDocument.setContent("it meows like a cat");

    List<LabelledDocument> documents = new ArrayList<>(2);
    documents.add(dogDocument);
    documents.add(catDocument);

    LabelAwareIterator labelAwareIterator = new SimpleLabelAwareIterator(documents);

    TfidfVectorizer vectorizer = new TfidfVectorizer.Builder()
            .setMinWordFrequency(1)
            .setStopWords(new ArrayList<String>())
            .setTokenizerFactory(new DefaultTokenizerFactory())
            .setIterator(labelAwareIterator)
            .allowParallelTokenization(false)
            .build();

    vectorizer.fit();

    DataSet vectorized = vectorizer.vectorize("it meows like a cat", "cat");
    assertNotNull(vectorized);

    LabelsSource labelsSource = vectorizer.getLabelsSource();
    assertEquals(2, labelsSource.getNumberOfLabelsUsed());
    List<String> trackedLabels = labelsSource.getLabels();
    assertEquals("dog", trackedLabels.get(0));
    assertEquals("cat", trackedLabels.get(1));
}
 
Example #4
Source File: DocumentSequenceConvertFunction.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Maps one labelled document to a {@code Sequence<VocabWord>} for distributed fitting.
 * Pre-tokenized (referenced) content is reused when present; otherwise the raw
 * content is tokenized lazily with a per-worker tokenizer factory. All non-empty
 * labels are attached as label-flagged elements.
 *
 * @param document document to convert
 * @return sequence of vocab words with labels attached
 */
@Override
public Sequence<VocabWord> call(LabelledDocument document) throws Exception {
    Sequence<VocabWord> result = new Sequence<>();

    // Elements: prefer already-referenced content over re-tokenizing.
    List<VocabWord> referenced = document.getReferencedContent();
    if (referenced != null && !referenced.isEmpty()) {
        result.addElements(referenced);
    } else {
        // Tokenizer factory is created lazily on the executor.
        if (tokenizerFactory == null)
            instantiateTokenizerFactory();

        for (String token : tokenizerFactory.create(document.getContent()).getTokens()) {
            if (token != null && !token.isEmpty())
                result.addElement(new VocabWord(1.0, token));
        }
    }

    // Labels: each non-empty label becomes a VocabWord explicitly marked as a label.
    for (String label : document.getLabels()) {
        if (label != null && !label.isEmpty()) {
            VocabWord labelWord = new VocabWord(1.0, label);
            labelWord.markAsLabel(true);
            result.addSequenceLabel(labelWord);
        }
    }

    return result;
}
 
Example #5
Source File: ParagraphVectorsClassifierExample.java    From Java-for-Data-Science with MIT License 4 votes vote down vote up
/**
 * Trains a ParagraphVectors model on labelled documents, then classifies each
 * unlabelled document by comparing its centroid vector against the learned
 * label vectors, printing the per-label similarity scores.
 */
public static void main(String[] args) throws Exception {

    ClassPathResource labeledResource = new ClassPathResource("paravec/labeled");
    iter = new FileLabelAwareIterator.Builder()
            .addSourceFolder(labeledResource.getFile())
            .build();

    tFact = new DefaultTokenizerFactory();
    tFact.setTokenPreProcessor(new CommonPreprocessor());

    pVect = new ParagraphVectors.Builder()
            .learningRate(0.025)
            .minLearningRate(0.001)
            .batchSize(1000)
            .epochs(20)
            .iterate(iter)
            .trainWordVectors(true)
            .tokenizerFactory(tFact)
            .build();

    pVect.fit();

    ClassPathResource unlabeledResource = new ClassPathResource("paravec/unlabeled");
    FileLabelAwareIterator unlabeledDocuments = new FileLabelAwareIterator.Builder()
            .addSourceFolder(unlabeledResource.getFile())
            .build();

    MeansBuilder meansBuilder = new MeansBuilder(
            (InMemoryLookupTable<VocabWord>) pVect.getLookupTable(), tFact);
    LabelSeeker seeker = new LabelSeeker(iter.getLabelsSource().getLabels(),
            (InMemoryLookupTable<VocabWord>) pVect.getLookupTable());

    // Score each unlabelled document's centroid against every known label vector.
    while (unlabeledDocuments.hasNextDocument()) {
        LabelledDocument document = unlabeledDocuments.nextDocument();
        INDArray centroid = meansBuilder.documentAsVector(document);
        List<Pair<String, Double>> scores = seeker.getScores(centroid);

        out.println("Document '" + document.getLabel() + "' falls into the following categories: ");
        for (Pair<String, Double> score : scores) {
            out.println("        " + score.getFirst() + ": " + score.getSecond());
        }
    }
}
 
Example #6
Source File: ParallelTransformerIterator.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
/**
 * Pairs a document with the transformer that will turn it into a sequence.
 *
 * @param document    document to be transformed
 * @param transformer transformer applied to the document's content
 */
public CallableTransformer(LabelledDocument document, SentenceTransformer transformer) {
    this.document = document;
    this.transformer = transformer;
}
 
Example #7
Source File: SentenceIteratorConverter.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
/**
 * {@inheritDoc}
 * <p>Delegates directly to {@link #nextDocument()}.</p>
 */
@Override
public LabelledDocument next() {
    LabelledDocument document = nextDocument();
    return document;
}
 
Example #8
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Test
public void testIterator() throws IOException {
    File labeledFolder = testDir.newFolder();
    File unlabeledFolder = testDir.newFolder();
    new ClassPathResource("/paravec/labeled/").copyDirectory(labeledFolder);
    new ClassPathResource("/paravec/unlabeled/").copyDirectory(unlabeledFolder);

    FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder()
            .addSourceFolder(labeledFolder).build();

    File sentencesFile = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator sentenceIterator = new BasicLineIterator(sentencesFile);

    // Both iterators must survive repeated full passes with identical counts.
    for (int epoch = 0; epoch < 10; ++epoch) {
        int documentCount = 0;
        int labelCount = 0;
        int referencedWordCount = 0;
        while (labelAwareIterator.hasNextDocument()) {
            ++documentCount;
            LabelledDocument document = labelAwareIterator.nextDocument();
            labelCount += document.getLabels().size();
            List<VocabWord> referenced = document.getReferencedContent();
            if (!CollectionUtils.isEmpty(referenced))
                referencedWordCount += referenced.size();
        }
        labelAwareIterator.reset();

        // File-backed documents carry no referenced content; 30 docs, one label each.
        assertEquals(0, referencedWordCount);
        assertEquals(30, labelCount);
        assertEquals(30, documentCount);

        int sentenceCount = 0;
        while (sentenceIterator.hasNext()) {
            ++sentenceCount;
            sentenceIterator.nextSentence();
        }
        assertEquals(97162, sentenceCount);
        sentenceIterator.reset();
    }
}
 
Example #9
Source File: SparkParagraphVectors.java    From deeplearning4j with Apache License 2.0 3 votes vote down vote up
/**
 * Builds the ParagraphVectors model from a {@code JavaRDD<LabelledDocument>}.
 * Documents may be pre-tokenized or raw; each is converted to a
 * {@code Sequence<VocabWord>} on the executors before fitting.
 *
 * @param documentsRdd RDD of labelled documents to fit the model on
 */
public void fitLabelledDocuments(JavaRDD<LabelledDocument> documentsRdd) {
    validateConfiguration();
    broadcastEnvironment(new JavaSparkContext(documentsRdd.context()));

    // Distributed document -> sequence conversion, then delegate to the parent fit.
    JavaRDD<Sequence<VocabWord>> sequences =
            documentsRdd.map(new DocumentSequenceConvertFunction(configurationBroadcast));
    super.fitSequences(sequences);
}