Java Code Examples for org.deeplearning4j.models.word2vec.Word2Vec#fit()

The following examples show how to use org.deeplearning4j.models.word2vec.Word2Vec#fit(). You can go to the original project or source file for each example via the source noted above its code.
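Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: build a SentenceIterator over a corpus, configure a Word2Vec.Builder, and call fit() to build the vocabulary and train the vectors. The class name, corpus path, and hyperparameter values below are illustrative assumptions distilled from the examples, not taken from any one project.

import java.util.Collection;

import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class Word2VecFitSketch {

    public static void main(String[] args) throws Exception {
        // Corpus with one sentence per line; replace the path with your own data
        SentenceIterator iter = new BasicLineIterator("/path/to/corpus.txt");

        // Tokenize on whitespace; CommonPreprocessor lowercases and strips punctuation
        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        // Configure the model; fit() then builds the vocabulary and trains the vectors
        Word2Vec vec = new Word2Vec.Builder()
                .minWordFrequency(5)   // ignore words that occur fewer than 5 times
                .layerSize(100)        // dimensionality of the learned vectors
                .seed(42)
                .windowSize(5)
                .iterate(iter)
                .tokenizerFactory(t)
                .build();
        vec.fit();

        // Once fitted, the model can be queried
        Collection<String> nearest = vec.wordsNearest("day", 10);
        System.out.println(nearest);
    }
}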
Example 1
Source File: Word2VecCN.java    From word2vec with Apache License 2.0
public Word2Vec fit() {
  log.info("Building model....");
  Word2Vec vec =
      new Word2Vec.Builder()
          .minWordFrequency(minWordFrequency)
          .iterations(iterations)
          .layerSize(layerSize)
          .seed(seed)
          .windowSize(windowSize)
          .iterate(sentenceIterator)
          .tokenizerFactory(tokenizerFactory)
          .build();

  log.info("Fitting Word2Vec model....");
  vec.fit();
  return vec;
}
 
Example 2
Source File: PerformanceTests.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testWord2VecCBOWBig() throws Exception {
    SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt");
    //iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt");
    //SentenceIterator iter = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt");

    TokenizerFactory t = new KoreanTokenizerFactory();
    //t = new DefaultTokenizerFactory();
    //t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
                    .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
                    .allowParallelTokenization(true).tokenizerFactory(t)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    long time1 = System.currentTimeMillis();

    vec.fit();

    long time2 = System.currentTimeMillis();

    log.info("Total execution time: {}", (time2 - time1));
}
 
Example 3
Source File: ChineseTokenizerTest.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
    SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");

    log.info("load is right!");
    TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
    //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());

    // Generates word vectors from the dataset stored in the resources folder
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
                    .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build();
    vec.fit();
    WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));

    // Next step (not implemented here): train a model that finds names in news text (.txt files),
    // using the word vectors generated above
    // WordVectors wordVectors;

    // Then test whether the model can find names in previously unseen text
}
 
Example 4
Source File: ManualTests.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testWord2VecPlot() throws Exception {
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
                    .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
                    .tokenizerFactory(t).build();

    vec.fit();

    //        UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();

    //        vec.getLookupTable().plotVocab(100, connectionInfo);

    Thread.sleep(10000000000L);
    fail("Not implemented");
}
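Note that this test never finishes normally as written: the UI plotting calls are commented out, the thread sleeps effectively indefinitely, and the test then deliberately ends in fail("Not implemented"). Only the fit() call itself is exercised.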
 
Example 5
Source File: Word2VecModelExample.java    From Java-Deep-Learning-Cookbook with MIT License
public static void main(String[] args) throws Exception {
    final SentenceIterator iterator = new LineSentenceIterator(new ClassPathResource("raw_sentences_large.txt").getFile());
    SentenceDataPreProcessor.setPreprocessor(iterator);
    final TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new EndingPreProcessor());

    final Word2Vec model = new Word2Vec.Builder()
                                    .iterate(iterator)
                                    .tokenizerFactory(tokenizerFactory)
                                    .minWordFrequency(5)
                                    .layerSize(100)
                                    .seed(42)
                                    .epochs(50)
                                    .windowSize(5)
                                    .build();
    log.info("Fitting Word2Vec model....");
    model.fit();

    final Collection<String> words = model.wordsNearest("season", 10);
    for (final String word : words) {
        System.out.println(word + " ");
    }
    final double cosSimilarity = model.similarity("season", "program");
    System.out.println(cosSimilarity);

    BarnesHutTsne tsne = new BarnesHutTsne.Builder()
            .setMaxIter(100)
            .theta(0.5)
            .normalize(false)
            .learningRate(500)
            .useAdaGrad(false)
            .build();


    //save word vectors for tSNE visualization.
    WordVectorSerializer.writeWordVectors(model.lookupTable(),new File("words.txt"));
    WordVectorSerializer.writeWord2VecModel(model, "model.zip");

}
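Note that the BarnesHutTsne instance is configured here but never invoked; the vectors written to words.txt are the intended input for a separate t-SNE visualization step, while model.zip preserves the full Word2Vec model for later reuse.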
 
Example 6
Source File: Word2VecRawTextExample.java    From Java-Data-Science-Cookbook with MIT License
public static void main(String[] args) throws Exception {
    // Path to the text file
    String filePath = "c:/raw_sentences.txt";

    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after each line
    SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache();
    WeightLookupTable table = new InMemoryLookupTable.Builder()
            .vectorLength(100)
            .useAdaGrad(false)
            .cache(cache)
            .lr(0.025f).build();

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5).iterations(1)
            .layerSize(100).lookupTable(table)
            .stopWords(new ArrayList<String>())
            .vocabCache(cache).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    log.info("Writing word vectors to text file....");
    // Write word vectors to file
    WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");

    log.info("Closest Words:");
    Collection<String> lst = vec.wordsNearest("man", 5);
    System.out.println(lst);
    double cosSim = vec.similarity("cruise", "voyage");
    System.out.println(cosSim);
}
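Note that explicitly supplying the lookup table and vocab cache, as done here, is optional: the builder creates suitable defaults. InMemoryLookupCache is also deprecated in more recent deeplearning4j releases in favor of AbstractCache.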
 
Example 7
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testIndexPersistence() throws Exception {
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5)
                    .iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    VocabCache orig = vec.getVocab();

    File tempFile = File.createTempFile("temp", "w2v");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWordVectors(vec, tempFile);

    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(tempFile);

    VocabCache rest = vec2.vocab();

    assertEquals(orig.totalNumberOfDocs(), rest.totalNumberOfDocs());

    for (VocabWord word : vec.getVocab().vocabWords()) {
        INDArray array1 = vec.getWordVectorMatrix(word.getLabel());
        INDArray array2 = vec2.getWordVectorMatrix(word.getLabel());

        assertEquals(array1, array2);
    }
}
 
Example 8
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore //AB 2020/02/06 - https://github.com/eclipse/deeplearning4j/issues/8677
public void testDirectInference() throws Exception {
    boolean isIntegration = isIntegrationTests();
    File resource = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator sentencesIter = getIterator(isIntegration, resource);

    ClassPathResource resource_mixed = new ClassPathResource("paravec/");
    File local_resource_mixed = testDir.newFolder();
    resource_mixed.copyDirectory(local_resource_mixed);
    SentenceIterator iter = new AggregatingSentenceIterator.Builder()
                    .addSentenceIterator(sentencesIter)
                    .addSentenceIterator(new FileSentenceIterator(local_resource_mixed)).build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec wordVectors = new Word2Vec.Builder().minWordFrequency(1).batchSize(250).iterations(1).epochs(1)
                    .learningRate(0.025).layerSize(150).minLearningRate(0.001)
                    .elementsLearningAlgorithm(new SkipGram<VocabWord>()).useHierarchicSoftmax(true).windowSize(5)
                    .iterate(iter).tokenizerFactory(t).build();

    wordVectors.fit();

    ParagraphVectors pv = new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10)
                    .useHierarchicSoftmax(true).trainWordVectors(true).useExistingWordVectors(wordVectors)
                    .negativeSample(0).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
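Note the builder call useExistingWordVectors(wordVectors): the ParagraphVectors model reuses the word embeddings just fitted by Word2Vec, so inferVector() operates on top of those vectors rather than on word vectors trained from scratch.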
 
Example 9
Source File: Word2VecDataSetIteratorTest.java    From deeplearning4j with Apache License 2.0
/**
 * Basically all we want from this test - being able to finish without exceptions.
 */
@Test
public void testIterator1() throws Exception {

    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = ParagraphVectorsTest.getIterator(isIntegrationTests(), inputFile);
    //        SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10) // we make sure we'll have some missing words
                    .iterations(1).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0)
                    .useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>())
                    .useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    vec.fit();

    List<String> labels = new ArrayList<>();
    labels.add("positive");
    labels.add("negative");

    Word2VecDataSetIterator iterator = new Word2VecDataSetIterator(vec, getLASI(iter, labels), labels, 1);
    INDArray array = iterator.next().getFeatures();
    int count = 0;
    while (iterator.hasNext()) {
        DataSet ds = iterator.next();

        assertArrayEquals(array.shape(), ds.getFeatures().shape());

        // raw_sentences.txt is 2.81 MB and takes quite some time to process;
        // only the first 20 minibatches are checked when running as a unit test
        if (!isIntegrationTests() && count++ > 20)
            break;
    }
}
 
Example 10
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testOutputStream() throws Exception {
    File file = File.createTempFile("tmp_ser", "ssa");
    file.deleteOnExit();

    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0)
                    .cache(cache).lr(0.025f).build();

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5)
                    .vocabCache(cache).seed(42)
                    //                .workers(6)
                    .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();

    INDArray day1 = vec.getWordVectorMatrix("day");

    WordVectorSerializer.writeWordVectors(vec, new FileOutputStream(file));

    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);

    INDArray day2 = vec2.getWordVectorMatrix("day");

    assertEquals(day1, day2);

    File tempFile = File.createTempFile("tetsts", "Fdfs");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWord2VecModel(vec, tempFile);

    Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile);
}
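Note the difference between the two serialization paths used here: writeWordVectors stores only the word/vector pairs in a text format, while writeWord2VecModel saves the full model state, which readWord2VecModel restores in the final line.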
 
Example 11
Source File: UITest.java    From deeplearning4j with Apache License 2.0
@Test
public void testPosting() throws Exception {

    //        File inputFile = Resources.asFile("big/raw_sentences.txt");
    File inputFile = new ClassPathResource("/basic/word2vec_advance.txt").getFile();
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).epochs(1).layerSize(20)
                    .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5)
                    .iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    File tempFile = File.createTempFile("temp", "w2v");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWordVectors(vec, tempFile);

    WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile);

    UIServer.getInstance(); //Initialize

    UiConnectionInfo uiConnectionInfo =
                    new UiConnectionInfo.Builder().setAddress("localhost").setPort(9000).build();

    BarnesHutTsne tsne = new BarnesHutTsne.Builder().normalize(false).setFinalMomentum(0.8f).numDimension(2)
                    .setMaxIter(10).build();

    vectors.lookupTable().plotVocab(tsne, vectors.lookupTable().getVocabCache().numWords(), uiConnectionInfo);


    Thread.sleep(100000);
}