org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory Java Examples

The following examples show how to use org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory. Each example is extracted from an open source project; the source file, project, and license are noted above it.
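
Before the project examples, here is a minimal, self-contained sketch of the usage pattern they all share: construct the factory, optionally attach a token pre-processor, create a Tokenizer from a String or an InputStream, and iterate over the tokens. The class name TokenizerFactoryDemo and the sample sentence are illustrative only; the API calls mirror those in the examples below.

import java.io.ByteArrayInputStream;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class TokenizerFactoryDemo {
    public static void main(String[] args) {
        TokenizerFactory factory = new DefaultTokenizerFactory();
        // optional: lowercase each token and strip punctuation
        factory.setTokenPreProcessor(new CommonPreprocessor());

        // a Tokenizer can be created from a String...
        Tokenizer fromString = factory.create("Mary had a little lamb.");
        while (fromString.hasMoreTokens()) {
            System.out.println(fromString.nextToken());
        }

        // ...or from an InputStream, as several tests below do
        Tokenizer fromStream = factory.create(new ByteArrayInputStream("Mary had a little lamb.".getBytes()));
        System.out.println(fromStream.countTokens() + " tokens");
    }
}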
Example #1
Source File: Word2VecTestsSmall.java    From deeplearning4j with Apache License 2.0
@Test
public void testLabelAwareIterator_1() throws Exception {
    val resource = new ClassPathResource("/labeled");
    val file = resource.getFile();

    val iter = (LabelAwareIterator) new FileLabelAwareIterator.Builder().addSourceFolder(file).build();

    val t = new DefaultTokenizerFactory();

    val w2v = new Word2Vec.Builder()
            .iterate(iter)
            .tokenizerFactory(t)
            .build();

    // nothing is expected to happen here; we only check that construction succeeds without exceptions
}
 
Example #2
Source File: SparkWord2VecTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore("AB 2019/05/21 - Failing - Issue #7657")
public void testStringsTokenization1() throws Exception {
    JavaRDD<String> rddSentences = sc.parallelize(sentences);

    SparkWord2Vec word2Vec = new SparkWord2Vec();
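    // note: the tokenizer factory and the learning algorithm are referenced by fully-qualified
    // class name here, presumably so the configuration can be serialized out to Spark workers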
    word2Vec.getConfiguration().setTokenizerFactory(DefaultTokenizerFactory.class.getCanonicalName());
    word2Vec.getConfiguration().setElementsLearningAlgorithm("org.deeplearning4j.spark.models.sequencevectors.learning.elements.SparkSkipGram");
    word2Vec.setExporter(new SparkModelExporter<VocabWord>() {
        @Override
        public void export(JavaRDD<ExportContainer<VocabWord>> rdd) {
            rdd.foreach(new TestFn());
        }
    });


    word2Vec.fitSentences(rddSentences);

    VocabCache<ShallowSequenceElement> vocabCache = word2Vec.getShallowVocabCache();

    assertNotEquals(null, vocabCache);

    assertEquals(9, vocabCache.numWords());
    assertEquals(2.0, vocabCache.wordFor(SequenceElement.getLongHash("one")).getElementFrequency(), 1e-5);
    assertEquals(1.0, vocabCache.wordFor(SequenceElement.getLongHash("two")).getElementFrequency(), 1e-5);
}
 
Example #3
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultStreamTokenizer() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    assertEquals(5, tokenizer2.countTokens());

    int cnt = 0;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer2.nextToken();
        log.info(tok1);
        cnt++;
    }

    assertEquals(5, cnt);
}
 
Example #4
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
    WordVectors googleVectors = WordVectorSerializer.readWord2VecModel(new File("/ext/GoogleNews-vectors-negative300.bin.gz"));

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    ParagraphVectors pv =
                    new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10).useHierarchicSoftmax(false)
                                    .trainWordVectors(false).iterations(10).useExistingWordVectors(googleVectors)
                                    .negativeSample(10).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
 
Example #5
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultTokenizer2() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    tokenizer2.countTokens();
    while (tokenizer.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        assertEquals(tok1, tok2);
    }


    System.out.println("-----------------------------------------------");

    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();
    int stringCount2 = t.create(resource.getInputStream()).countTokens();

    log.info("String tok: [" + stringCount + "], Stream tok: [" + stringCount2 + "], Difference: "
                    + Math.abs(stringCount - stringCount2));

    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
 
Example #6
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testParallelIterator() throws IOException {
    TokenizerFactory factory = new DefaultTokenizerFactory();
    SentenceIterator iterator = new BasicLineIterator(Resources.asFile("big/raw_sentences.txt"));

    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iterator).allowMultithreading(true)
            .tokenizerFactory(factory).build();

    BasicTransformerIterator iter = (BasicTransformerIterator)transformer.iterator();
    for (int i = 0; i < 100; ++i) {
        int cnt = 0;
        long counter = 0;
        Sequence<VocabWord> sequence = null;
        while (iter.hasNext()) {
            sequence = iter.next();
            counter += sequence.size();
            cnt++;
        }
        iter.reset();
        assertEquals(757172, counter);
    }
}
 
Example #7
Source File: DefaultDocumentIteratorTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testDocumentIterator() throws Exception {
    ClassPathResource reuters5250 = new ClassPathResource("/reuters/5250");
    File f = reuters5250.getFile();

    DocumentIterator iter = new FileDocumentIterator(f.getAbsolutePath());

    InputStream doc = iter.nextDocument();

    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer next = t.create(doc);
    String[] list = "PEARSON CONCENTRATES ON FOUR SECTORS".split(" ");
    ///PEARSON CONCENTRATES ON FOUR SECTORS
    int count = 0;
    while (next.hasMoreTokens() && count < list.length) {
        String token = next.nextToken();
        assertEquals(list[count++], token);
    }


    doc.close();
}
 
Example #8
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    long time1 = System.currentTimeMillis();
    Word2Vec vec = WordVectorSerializer.readWord2VecModel(
                    new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
    long time2 = System.currentTimeMillis();
    log.info("Model loaded in {} msec", time2 - time1);
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    vec.setTokenizerFactory(t);
    vec.setSentenceIterator(iter);
    vec.getConfiguration().setUseHierarchicSoftmax(false);
    vec.getConfiguration().setNegative(5.0);
    vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());

    vec.fit();
}
 
Example #9
Source File: ManualTests.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testWord2VecPlot() throws Exception {
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
                    .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
                    .tokenizerFactory(t).build();

    vec.fit();

    //        UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();

    //        vec.getLookupTable().plotVocab(100, connectionInfo);

    Thread.sleep(10000000000L);
    fail("Not implemented");
}
 
Example #10
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultTokenizer1() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);
    }


    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();
    int stringCount2 = t.create(resource.getInputStream()).countTokens();
    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
 
Example #11
Source File: TfidfVectorizerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testTfIdfVectorizerFromLabelAwareIterator() throws Exception {
    LabelledDocument doc1 = new LabelledDocument();
    doc1.addLabel("dog");
    doc1.setContent("it barks like a dog");

    LabelledDocument doc2 = new LabelledDocument();
    doc2.addLabel("cat");
    doc2.setContent("it meows like a cat");

    List<LabelledDocument> docs = new ArrayList<>(2);
    docs.add(doc1);
    docs.add(doc2);
    
    LabelAwareIterator iterator = new SimpleLabelAwareIterator(docs);
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();

    TfidfVectorizer vectorizer = new TfidfVectorizer
        .Builder()
        .setMinWordFrequency(1)
        .setStopWords(new ArrayList<String>())
        .setTokenizerFactory(tokenizerFactory)
        .setIterator(iterator)
        .allowParallelTokenization(false)
        .build();

    vectorizer.fit();

    DataSet dataset = vectorizer.vectorize("it meows like a cat", "cat");
    assertNotNull(dataset);
    
    LabelsSource source = vectorizer.getLabelsSource();
    assertEquals(2, source.getNumberOfLabelsUsed());
    List<String> labels = source.getLabels();
    assertEquals("dog", labels.get(0));
    assertEquals("cat", labels.get(1));
}
 
Example #12
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testIndexPersistence() throws Exception {
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5)
                    .iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    VocabCache orig = vec.getVocab();

    File tempFile = File.createTempFile("temp", "w2v");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWordVectors(vec, tempFile);

    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(tempFile);

    VocabCache rest = vec2.vocab();

    assertEquals(orig.totalNumberOfDocs(), rest.totalNumberOfDocs());

    for (VocabWord word : vec.getVocab().vocabWords()) {
        INDArray array1 = vec.getWordVectorMatrix(word.getLabel());
        INDArray array2 = vec2.getWordVectorMatrix(word.getLabel());

        assertEquals(array1, array2);
    }
}
 
Example #13
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultTokenizer3() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);
    }
}
 
Example #14
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore //AB 2020/02/06 - https://github.com/eclipse/deeplearning4j/issues/8677
public void testDirectInference() throws Exception {
    boolean isIntegration = isIntegrationTests();
    File resource = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator sentencesIter = getIterator(isIntegration, resource);

    ClassPathResource resource_mixed = new ClassPathResource("paravec/");
    File local_resource_mixed = testDir.newFolder();
    resource_mixed.copyDirectory(local_resource_mixed);
    SentenceIterator iter = new AggregatingSentenceIterator.Builder()
                    .addSentenceIterator(sentencesIter)
                    .addSentenceIterator(new FileSentenceIterator(local_resource_mixed)).build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec wordVectors = new Word2Vec.Builder().minWordFrequency(1).batchSize(250).iterations(1).epochs(1)
                    .learningRate(0.025).layerSize(150).minLearningRate(0.001)
                    .elementsLearningAlgorithm(new SkipGram<VocabWord>()).useHierarchicSoftmax(true).windowSize(5)
                    .iterate(iter).tokenizerFactory(t).build();

    wordVectors.fit();

    ParagraphVectors pv = new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10)
                    .useHierarchicSoftmax(true).trainWordVectors(true).useExistingWordVectors(wordVectors)
                    .negativeSample(0).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
 
Example #15
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testDoubleFit() throws Exception {
    boolean isIntegration = isIntegrationTests();
    File resource = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator iter = getIterator(isIntegration, resource);


    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    val builder = new ParagraphVectors.Builder();
    ParagraphVectors vec = builder.minWordFrequency(1).iterations(5).seed(119).epochs(1)
            .layerSize(150).learningRate(0.025).labelsSource(source).windowSize(5)
            .sequenceLearningAlgorithm(new DM<VocabWord>()).iterate(iter).trainWordVectors(true)
            .usePreciseWeightInit(true)
            .batchSize(8192)
            .allowParallelTokenization(false)
            .tokenizerFactory(t).workers(1).sampling(0).build();

    vec.fit();
    long num1 = vec.vocab().totalNumberOfDocs();

    vec.fit();
    System.out.println(vec.vocab().totalNumberOfDocs());
    long num2 = vec.vocab().totalNumberOfDocs();

    assertEquals(num1, num2);
}
 
Example #16
Source File: Word2VecTestsSmall.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testUnkSerialization_1() throws Exception {
    val inputFile = Resources.asFile("big/raw_sentences.txt");
    // val iter = new BasicLineIterator(inputFile);
    val iter = ParagraphVectorsTest.getIterator(isIntegrationTests(), inputFile);
    val t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    val vec = new Word2Vec.Builder()
            .minWordFrequency(1)
            .epochs(1)
            .layerSize(300)
            .limitVocabularySize(1) // Limit the vocab size to 2 words
            .windowSize(5)
            .allowParallelTokenization(true)
            .batchSize(512)
            .learningRate(0.025)
            .minLearningRate(0.0001)
            .negativeSample(0.0)
            .sampling(0.0)
            .useAdaGrad(false)
            .useHierarchicSoftmax(true)
            .iterations(1)
            .useUnknown(true) // Using UNK with limited vocab size causes the issue
            .seed(42)
            .iterate(iter)
            .workers(4)
            .tokenizerFactory(t).build();

    vec.fit();

    val tmpFile = File.createTempFile("temp", "temp");
    tmpFile.deleteOnExit();

    WordVectorSerializer.writeWord2VecModel(vec, tmpFile); // NullPointerException was thrown here
}
 
Example #17
Source File: Word2VecDataSetIteratorTest.java    From deeplearning4j with Apache License 2.0
/**
 * All we want from this test is for it to finish without exceptions.
 */
@Test
public void testIterator1() throws Exception {

    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = ParagraphVectorsTest.getIterator(isIntegrationTests(), inputFile);
    // SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10) // we make sure we'll have some missing words
                    .iterations(1).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0)
                    .useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>())
                    .useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    vec.fit();

    List<String> labels = new ArrayList<>();
    labels.add("positive");
    labels.add("negative");

    Word2VecDataSetIterator iterator = new Word2VecDataSetIterator(vec, getLASI(iter, labels), labels, 1);
    INDArray array = iterator.next().getFeatures();
    int count = 0;
    while (iterator.hasNext()) {
        DataSet ds = iterator.next();

        assertArrayEquals(array.shape(), ds.getFeatures().shape());

        // raw_sentences.txt is 2.81 MB and takes quite some time to process; we only check the first 20 minibatches in unit tests
        if (!isIntegrationTests() && count++ > 20)
            break;
    }
}
 
Example #18
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testWordsNearestSum() throws IOException {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    log.info("Load & Vectorize Sentences....");
    SentenceIterator iter = new BasicLineIterator(inputFile);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(1)
            .layerSize(100)
            .seed(42)
            .windowSize(5)
            .iterate(iter)
            .tokenizerFactory(t)
            .build();

    log.info("Fitting Word2Vec model....");
    vec.fit();
    log.info("Writing word vectors to text file....");
    log.info("Closest Words:");
    Collection<String> lst = vec.wordsNearestSum("day", 10);
    log.info("10 Words closest to 'day': {}", lst);
    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));
    assertTrue(lst.contains("years"));
    assertTrue(lst.contains("time"));
}
 
Example #19
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testWordVectorsAbsentLabels_WithUnknown() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).batchSize(8192).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
            //.negativeSample(10)
            .epochs(1).windowSize(5).allowParallelTokenization(true)
            .workers(4)
            .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t)
            .useUnknown(true).unknownElement(new VocabWord(1, "UNKNOWN")).build();

    vec.fit();

    ArrayList<String> labels = new ArrayList<>();
    labels.add("bus");
    labels.add("car");

    INDArray matrix = vec.getWordVectors(labels);
    for (int i = 0; i < labels.size(); ++i)
        assertEquals(matrix.getRow(i, true), vec.getWordVectorMatrix("UNKNOWN"));
}
 
Example #20
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testWordVectorsAbsentLabels() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10).useUnknown(true)
            .iterations(1).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new CBOW<VocabWord>()).epochs(1).windowSize(5)
            .useHierarchicSoftmax(true).allowParallelTokenization(true)
            .useUnknown(false)
            .modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    ArrayList<String> labels = new ArrayList<>();
    labels.add("fewfew");

    INDArray matrix = vec.getWordVectors(labels);
    assertTrue(matrix.isEmpty());
}
 
Example #21
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testWordVectorsPartiallyAbsentLabels() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10).useUnknown(true)
            .iterations(1).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new CBOW<VocabWord>()).epochs(1).windowSize(5)
            .useHierarchicSoftmax(true).allowParallelTokenization(true)
            .useUnknown(false)
            .modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    ArrayList<String> labels = new ArrayList<>();
    labels.add("fewfew");
    labels.add("day");
    labels.add("night");
    labels.add("week");

    INDArray matrix = vec.getWordVectors(labels);
    assertEquals(3, matrix.rows());
    assertEquals(matrix.getRow(0, true), vec.getWordVectorMatrix("day"));
    assertEquals(matrix.getRow(1, true), vec.getWordVectorMatrix("night"));
    assertEquals(matrix.getRow(2, true), vec.getWordVectorMatrix("week"));
}
 
Example #22
Source File: Word2VecTest.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testSparkW2VonBiggerCorpus() throws Exception {
    SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest")
            .set("spark.driver.host", "localhost")
                    .set("spark.driver.maxResultSize", "4g").set("spark.driver.memory", "8g")
                    .set("spark.executor.memory", "8g");

    // Set SparkContext
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Path of data part-00000
    //String dataPath = Resources.asFile("big/raw_sentences.txt").getAbsolutePath();
    //        String dataPath = "/ext/Temp/SampleRussianCorpus.txt";
    String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();

    // Read in data
    JavaRDD<String> corpus = sc.textFile(dataPath);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new LowCasePreProcessor());

    Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1)
                    //     .setTokenizer("org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory")
                    //     .setTokenPreprocessor("org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor")
                    //     .setRemoveStop(false)
                    .tokenizerFactory(t).seed(42L).negative(3).useAdaGrad(false).layerSize(100).windowSize(5)
                    .learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5)
                    .useUnknown(true).build();

    word2Vec.train(corpus);


    sc.stop();

    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), "/ext/Temp/sparkRuModel.txt");
}
 
Example #23
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testWord2VecCBOW() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).learningRate(0.025).layerSize(150)
                    .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(4)
                    .tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    vec.fit();

    Collection<String> lst = vec.wordsNearest("day", 10);
    log.info(Arrays.toString(lst.toArray()));

    //   assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));
    assertTrue(sim > 0.65f);
}
 
Example #24
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore // no adagrad these days
public void testWord2VecAdaGrad() throws Exception {
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(5).learningRate(0.025).layerSize(100)
                    .seed(42).batchSize(13500).sampling(0).negativeSample(0)
                    //.epochs(10)
                    .windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false)
                    .useHierarchicSoftmax(true).iterate(iter).workers(4).tokenizerFactory(t).build();

    vec.fit();

    Collection<String> lst = vec.wordsNearest("day", 10);
    log.info(Arrays.toString(lst.toArray()));

    //   assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));
}
 
Example #25
Source File: Word2VecRawTextExample.java    From Java-Data-Science-Cookbook with MIT License
public static void main(String[] args) throws Exception {

    // Path to the text file
    String filePath = "c:/raw_sentences.txt";

    log.info("Load & Vectorize Sentences....");
    // Strip whitespace before and after each line
    SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache();
    WeightLookupTable table = new InMemoryLookupTable.Builder()
            .vectorLength(100)
            .useAdaGrad(false)
            .cache(cache)
            .lr(0.025f).build();

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5).iterations(1)
            .layerSize(100).lookupTable(table)
            .stopWords(new ArrayList<String>())
            .vocabCache(cache).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    log.info("Writing word vectors to text file....");
    // Write word vectors
    WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");

    log.info("Closest Words:");
    Collection<String> lst = vec.wordsNearest("man", 5);
    System.out.println(lst);
    double cosSim = vec.similarity("cruise", "voyage");
    System.out.println(cosSim);
}
 
Example #26
Source File: Word2VecModelExample.java    From Java-Deep-Learning-Cookbook with MIT License
public static void main(String[] args) throws Exception {
    final SentenceIterator iterator = new LineSentenceIterator(new ClassPathResource("raw_sentences_large.txt").getFile());
    SentenceDataPreProcessor.setPreprocessor(iterator);
    final TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new EndingPreProcessor());

    final Word2Vec model = new Word2Vec.Builder()
                                    .iterate(iterator)
                                    .tokenizerFactory(tokenizerFactory)
                                    .minWordFrequency(5)
                                    .layerSize(100)
                                    .seed(42)
                                    .epochs(50)
                                    .windowSize(5)
                                    .build();
    log.info("Fitting Word2Vec model....");
    model.fit();

    final Collection<String> words = model.wordsNearest("season",10);
    for (final String word : words) {
        System.out.println(word + " ");
    }
    final double cosSimilarity = model.similarity("season","program");
    System.out.println(cosSimilarity);

    BarnesHutTsne tsne = new BarnesHutTsne.Builder()
            .setMaxIter(100)
            .theta(0.5)
            .normalize(false)
            .learningRate(500)
            .useAdaGrad(false)
            .build();


    //save word vectors for tSNE visualization.
    WordVectorSerializer.writeWordVectors(model.lookupTable(),new File("words.txt"));
    WordVectorSerializer.writeWord2VecModel(model, "model.zip");

}
 
Example #27
Source File: BagOfWordsVectorizerTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 60000L)
public void testBagOfWordsVectorizer() throws Exception {
    val rootDir = testDir.newFolder();
    ClassPathResource resource = new ClassPathResource("rootdir/");
    resource.copyDirectory(rootDir);

    LabelAwareSentenceIterator iter = new LabelAwareFileSentenceIterator(rootDir);
    List<String> labels = Arrays.asList("label1", "label2");
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();

    BagOfWordsVectorizer vectorizer = new BagOfWordsVectorizer.Builder().setMinWordFrequency(1)
                    .setStopWords(new ArrayList<String>()).setTokenizerFactory(tokenizerFactory).setIterator(iter)
                    .allowParallelTokenization(false)
                    //                .labels(labels)
                    //                .cleanup(true)
                    .build();

    vectorizer.fit();
    VocabWord word = vectorizer.getVocabCache().wordFor("file.");
    assumeNotNull(word);
    assertEquals(word, vectorizer.getVocabCache().tokenFor("file."));
    assertEquals(2, vectorizer.getVocabCache().totalNumberOfDocs());

    assertEquals(2, word.getSequencesCount());
    assertEquals(2, word.getElementFrequency(), 0.1);

    VocabWord word1 = vectorizer.getVocabCache().wordFor("1");

    assertEquals(1, word1.getSequencesCount());
    assertEquals(1, word1.getElementFrequency(), 0.1);

    log.info("Labels used: " + vectorizer.getLabelsSource().getLabels());
    assertEquals(2, vectorizer.getLabelsSource().getNumberOfLabelsUsed());

    ///////////////////
    INDArray array = vectorizer.transform("This is 2 file.");
    log.info("Transformed array: " + array);
    assertEquals(5, array.columns());


    VocabCache<VocabWord> vocabCache = vectorizer.getVocabCache();

    assertEquals(2, array.getDouble(vocabCache.tokenFor("This").getIndex()), 0.1);
    assertEquals(2, array.getDouble(vocabCache.tokenFor("is").getIndex()), 0.1);
    assertEquals(2, array.getDouble(vocabCache.tokenFor("file.").getIndex()), 0.1);
    assertEquals(0, array.getDouble(vocabCache.tokenFor("1").getIndex()), 0.1);
    assertEquals(1, array.getDouble(vocabCache.tokenFor("2").getIndex()), 0.1);

    DataSet dataSet = vectorizer.vectorize("This is 2 file.", "label2");
    assertEquals(array, dataSet.getFeatures());

    INDArray labelz = dataSet.getLabels();
    log.info("Labels array: " + labelz);

    int idx2 = Nd4j.getExecutioner().exec(new ArgMax(labelz))[0].getInt(0);
    //int idx2 = ((IndexAccumulation) Nd4j.getExecutioner().exec(new IMax(labelz))).getFinalResult().intValue();

    //        assertEquals(1.0, dataSet.getLabels().getDouble(0), 0.1);
    //        assertEquals(0.0, dataSet.getLabels().getDouble(1), 0.1);

    dataSet = vectorizer.vectorize("This is 1 file.", "label1");

    assertEquals(2, dataSet.getFeatures().getDouble(vocabCache.tokenFor("This").getIndex()), 0.1);
    assertEquals(2, dataSet.getFeatures().getDouble(vocabCache.tokenFor("is").getIndex()), 0.1);
    assertEquals(2, dataSet.getFeatures().getDouble(vocabCache.tokenFor("file.").getIndex()), 0.1);
    assertEquals(1, dataSet.getFeatures().getDouble(vocabCache.tokenFor("1").getIndex()), 0.1);
    assertEquals(0, dataSet.getFeatures().getDouble(vocabCache.tokenFor("2").getIndex()), 0.1);

    int idx1 = Nd4j.getExecutioner().exec(new ArgMax(dataSet.getLabels()))[0].getInt(0);
    //int idx1 = ((IndexAccumulation) Nd4j.getExecutioner().exec(new IMax(dataSet.getLabels()))).getFinalResult().intValue();

    //assertEquals(0.0, dataSet.getLabels().getDouble(0), 0.1);
    //assertEquals(1.0, dataSet.getLabels().getDouble(1), 0.1);

    assertNotEquals(idx2, idx1);

    // Serialization check
    File tempFile = createTempFile("fdsf", "fdfsdf");
    tempFile.deleteOnExit();

    SerializationUtils.saveObject(vectorizer, tempFile);

    BagOfWordsVectorizer vectorizer2 = SerializationUtils.readObject(tempFile);
    vectorizer2.setTokenizerFactory(tokenizerFactory);

    dataSet = vectorizer2.vectorize("This is 2 file.", "label2");
    assertEquals(array, dataSet.getFeatures());
}
 
Example #28
Source File: SparkSequenceVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testFrequenciesCount() throws Exception {
    JavaRDD<Sequence<VocabWord>> sequences = sc.parallelize(sequencesCyclic);

    SparkSequenceVectors<VocabWord> seqVec = new SparkSequenceVectors<>();

    seqVec.getConfiguration().setTokenizerFactory(DefaultTokenizerFactory.class.getCanonicalName());
    seqVec.getConfiguration().setElementsLearningAlgorithm("org.deeplearning4j.spark.models.sequencevectors.learning.elements.SparkSkipGram");
    seqVec.setExporter(new SparkModelExporter<VocabWord>() {
        @Override
        public void export(JavaRDD<ExportContainer<VocabWord>> rdd) {
            rdd.foreach(new SparkWord2VecTest.TestFn());
        }
    });

    seqVec.fitSequences(sequences);

    Counter<Long> counter = seqVec.getCounter();

    // element "0" should have frequency of 20
    assertEquals(20, counter.getCount(0L), 1e-5);

    // elements 1 - 9 should have frequencies of 10
    for (int e = 1; e < sequencesCyclic.get(0).getElements().size() - 1; e++) {
        assertEquals(10, counter.getCount(sequencesCyclic.get(0).getElementByIndex(e).getStorageId()), 1e-5);
    }


    VocabCache<ShallowSequenceElement> shallowVocab = seqVec.getShallowVocabCache();

    assertEquals(10, shallowVocab.numWords());

    ShallowSequenceElement zero = shallowVocab.tokenFor(0L);
    ShallowSequenceElement first = shallowVocab.tokenFor(1L);

    assertNotEquals(null, zero);
    assertEquals(20.0, zero.getElementFrequency(), 1e-5);
    assertEquals(0, zero.getIndex());

    assertEquals(10.0, first.getElementFrequency(), 1e-5);
}
 
Example #29
Source File: TfidfVectorizerTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 60000L)
public void testTfIdfVectorizer() throws Exception {
    val rootDir = testDir.newFolder();
    ClassPathResource resource = new ClassPathResource("tripledir/");
    resource.copyDirectory(rootDir);
    
    assertTrue(rootDir.isDirectory());

    LabelAwareSentenceIterator iter = new LabelAwareFileSentenceIterator(rootDir);
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();

    TfidfVectorizer vectorizer = new TfidfVectorizer.Builder().setMinWordFrequency(1)
                    .setStopWords(new ArrayList<String>()).setTokenizerFactory(tokenizerFactory).setIterator(iter)
                    .allowParallelTokenization(false)
                    //                .labels(labels)
                    //                .cleanup(true)
                    .build();

    vectorizer.fit();
    VocabWord word = vectorizer.getVocabCache().wordFor("file.");
    assumeNotNull(word);
    assertEquals(word, vectorizer.getVocabCache().tokenFor("file."));
    assertEquals(3, vectorizer.getVocabCache().totalNumberOfDocs());

    assertEquals(3, word.getSequencesCount());
    assertEquals(3, word.getElementFrequency(), 0.1);

    VocabWord word1 = vectorizer.getVocabCache().wordFor("1");

    assertEquals(1, word1.getSequencesCount());
    assertEquals(1, word1.getElementFrequency(), 0.1);

    log.info("Labels used: " + vectorizer.getLabelsSource().getLabels());
    assertEquals(3, vectorizer.getLabelsSource().getNumberOfLabelsUsed());

    assertEquals(3, vectorizer.getVocabCache().totalNumberOfDocs());

    assertEquals(11, vectorizer.numWordsEncountered());

    INDArray vector = vectorizer.transform("This is 3 file.");
    log.info("TF-IDF vector: " + Arrays.toString(vector.data().asDouble()));

    VocabCache<VocabWord> vocabCache = vectorizer.getVocabCache();

    assertEquals(.04402, vector.getDouble(vocabCache.tokenFor("This").getIndex()), 0.001);
    assertEquals(.04402, vector.getDouble(vocabCache.tokenFor("is").getIndex()), 0.001);
    assertEquals(0.119, vector.getDouble(vocabCache.tokenFor("3").getIndex()), 0.001);
    assertEquals(0, vector.getDouble(vocabCache.tokenFor("file.").getIndex()), 0.001);



    DataSet dataSet = vectorizer.vectorize("This is 3 file.", "label3");
    //assertEquals(0.0, dataSet.getLabels().getDouble(0), 0.1);
    //assertEquals(0.0, dataSet.getLabels().getDouble(1), 0.1);
    //assertEquals(1.0, dataSet.getLabels().getDouble(2), 0.1);
    int cnt = 0;
    for (int i = 0; i < 3; i++) {
        if (dataSet.getLabels().getDouble(i) > 0.1)
            cnt++;
    }

    assertEquals(1, cnt);


    File tempFile = testDir.newFile("somefile.bin");
    tempFile.delete();

    SerializationUtils.saveObject(vectorizer, tempFile);

    TfidfVectorizer vectorizer2 = SerializationUtils.readObject(tempFile);
    vectorizer2.setTokenizerFactory(tokenizerFactory);

    dataSet = vectorizer2.vectorize("This is 3 file.", "label2");
    assertEquals(vector, dataSet.getFeatures());
}