Java Code Examples for org.deeplearning4j.models.embeddings.loader.WordVectorSerializer#writeWordVectors()

The following examples show how to use org.deeplearning4j.models.embeddings.loader.WordVectorSerializer#writeWordVectors(). You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. Related API usage can be found on the sidebar.
Example 1
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test
@Ignore
public void testWriteWordVectors() throws IOException {
    // Round trip: read a binary word2vec model, write its table/vocab pair
    // out as text, then reload the text file and spot-check known entries.
    WordVectors sourceModel = WordVectorSerializer.readWord2VecModel(binaryFile);
    InMemoryLookupTable sourceTable = (InMemoryLookupTable) sourceModel.lookupTable();
    InMemoryLookupCache sourceVocab = (InMemoryLookupCache) sourceModel.vocab();
    WordVectorSerializer.writeWordVectors(sourceTable, sourceVocab, pathToWriteto);

    WordVectors reloaded = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    double[] freeman = reloaded.getWordVector("Morgan_Freeman");
    double[] montalbano = reloaded.getWordVector("JA_Montalbano");
    // Vectors keep their original 300-dimension layout after the round trip.
    assertTrue(freeman.length == 300);
    assertTrue(montalbano.length == 300);
    // First components match the known values from the binary model.
    assertEquals(freeman[0], 0.044423, 1e-3);
    assertEquals(montalbano[0], 0.051964, 1e-3);
}
 
Example 2
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test
@Ignore
public void testWriteWordVectorsFromWord2Vec() throws IOException {
    // Round trip directly through the Word2Vec overload of writeWordVectors.
    WordVectors sourceModel = WordVectorSerializer.readWord2VecModel(binaryFile, true);
    WordVectorSerializer.writeWordVectors((Word2Vec) sourceModel, pathToWriteto);

    WordVectors reloaded = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    INDArray freeman = reloaded.getWordVectorMatrix("Morgan_Freeman");
    INDArray montalbano = reloaded.getWordVectorMatrix("JA_Montalbano");
    // Reloaded vectors must be identical to the ones held by the source model.
    assertEquals(sourceModel.getWordVectorMatrix("Morgan_Freeman"), freeman);
    assertEquals(sourceModel.getWordVectorMatrix("JA_Montalbano"), montalbano);
    // Dimensionality and known first components survive serialization.
    assertTrue(freeman.length() == 300);
    assertTrue(montalbano.length() == 300);
    assertEquals(freeman.getDouble(0), 0.044423, 1e-3);
    assertEquals(montalbano.getDouble(0), 0.051964, 1e-3);
}
 
Example 3
Source File: ChineseTokenizerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
    // Train a word2vec model over the Chinese-name corpus bundled in the
    // test resources, then persist the resulting vectors as a text file.
    SentenceIterator corpus = new BasicLineIterator("src/test/resources/chineseName.txt");

    log.info("load is right!");
    TokenizerFactory chineseTokenizer = new ChineseTokenizerFactory();

    Word2Vec model = new Word2Vec.Builder()
                    .minWordFrequency(2)
                    .iterations(5)
                    .layerSize(100)
                    .seed(42)
                    .learningRate(0.1)
                    .windowSize(20)
                    .iterate(corpus)
                    .tokenizerFactory(chineseTokenizer)
                    .build();
    model.fit();
    WordVectorSerializer.writeWordVectors(model, new File("src/test/resources/chineseNameWordVector.txt"));

    // TODO: train a name-recognition model on top of these word vectors and
    // verify that it can find names in previously unseen text.
}
 
Example 4
Source File: Word2VecModelExample.java    From Java-Deep-Learning-Cookbook with MIT License 5 votes vote down vote up
public static void main(String[] args) throws Exception {
    // Word2Vec training example: builds word embeddings from a raw sentence
    // corpus, queries nearest neighbours and cosine similarity, then persists
    // the vectors (for tSNE visualization) and the full model.
    final SentenceIterator iterator = new LineSentenceIterator(new ClassPathResource("raw_sentences_large.txt").getFile());
    SentenceDataPreProcessor.setPreprocessor(iterator);
    final TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new EndingPreProcessor());

    final Word2Vec model = new Word2Vec.Builder()
                                    .iterate(iterator)
                                    .tokenizerFactory(tokenizerFactory)
                                    .minWordFrequency(5)
                                    .layerSize(100)
                                    .seed(42)
                                    .epochs(50)
                                    .windowSize(5)
                                    .build();
    log.info("Fitting Word2Vec model....");
    model.fit();

    // Sanity checks on the trained embedding.
    final Collection<String> words = model.wordsNearest("season",10);
    for(final String word: words){
        System.out.println(word+ " ");
    }
    final double cosSimilarity = model.similarity("season","program");
    System.out.println(cosSimilarity);

    // Fix: the original built a BarnesHutTsne instance here that was never
    // used; that dead code has been removed. The vectors written below can
    // be fed to a tSNE tool for visualization.
    WordVectorSerializer.writeWordVectors(model.lookupTable(),new File("words.txt"));
    WordVectorSerializer.writeWord2VecModel(model, "model.zip");

}
 
Example 5
Source File: Word2VecModelExample.java    From Java-Deep-Learning-Cookbook with MIT License 5 votes vote down vote up
public static void main(String[] args) throws Exception {
    // Trains a Word2Vec embedding from a raw sentence corpus, runs quick
    // nearest-neighbour / similarity sanity checks, and saves both the raw
    // word vectors and the zipped model.
    final SentenceIterator iterator = new LineSentenceIterator(new ClassPathResource("raw_sentences_large.txt").getFile());
    SentenceDataPreProcessor.setPreprocessor(iterator);
    final TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new EndingPreProcessor());

    final Word2Vec model = new Word2Vec.Builder()
                                    .iterate(iterator)
                                    .tokenizerFactory(tokenizerFactory)
                                    .minWordFrequency(5)
                                    .layerSize(100)
                                    .seed(42)
                                    .epochs(50)
                                    .windowSize(5)
                                    .build();
    log.info("Fitting Word2Vec model....");
    model.fit();

    // Quick qualitative checks on the embedding.
    final Collection<String> words = model.wordsNearest("season",10);
    for(final String word: words){
        System.out.println(word+ " ");
    }
    final double cosSimilarity = model.similarity("season","program");
    System.out.println(cosSimilarity);

    // Fix: removed an unused BarnesHutTsne builder chain that allocated an
    // object which was never referenced afterwards (dead code). The word
    // vectors saved below are what a tSNE visualizer would consume.
    WordVectorSerializer.writeWordVectors(model.lookupTable(),new File("words.txt"));
    WordVectorSerializer.writeWord2VecModel(model, "model.zip");

}
 
Example 6
Source File: Word2VecRawTextExample.java    From Java-Data-Science-Cookbook with MIT License 5 votes vote down vote up
public static void main(String[] args) throws Exception {

    // Location of the raw sentence corpus (one sentence per line).
    String corpusPath = "c:/raw_sentences.txt";

    log.info("Load & Vectorize Sentences....");
    // The UIMA iterator strips leading/trailing whitespace from each line.
    SentenceIterator sentences = UimaSentenceIterator.createWithPath(corpusPath);
    // Tokenize on whitespace and normalize tokens via the common preprocessor.
    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    // Explicit vocab cache and lookup table shared with the model below.
    InMemoryLookupCache vocabCache = new InMemoryLookupCache();
    WeightLookupTable lookupTable = new InMemoryLookupTable.Builder()
            .vectorLength(100)
            .useAdaGrad(false)
            .cache(vocabCache)
            .lr(0.025f)
            .build();

    log.info("Building model....");
    Word2Vec model = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(1)
            .layerSize(100)
            .lookupTable(lookupTable)
            .stopWords(new ArrayList<String>())
            .vocabCache(vocabCache)
            .seed(42)
            .windowSize(5)
            .iterate(sentences)
            .tokenizerFactory(tokenizer)
            .build();

    log.info("Fitting Word2Vec model....");
    model.fit();

    log.info("Writing word vectors to text file....");
    WordVectorSerializer.writeWordVectors(model, "word2vec.txt");

    log.info("Closest Words:");
    Collection<String> nearest = model.wordsNearest("man", 5);
    System.out.println(nearest);
    double cosSim = model.similarity("cruise", "voyage");
    System.out.println(cosSim);
}
 
Example 7
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testIndexPersistence() throws Exception {
    // Train a model, serialize it to a temp text file, reload it, and check
    // that the vocabulary index and every word vector survive the round trip.
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator sentences = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec model = new Word2Vec.Builder()
                    .minWordFrequency(5)
                    .iterations(1)
                    .epochs(1)
                    .layerSize(100)
                    .stopWords(new ArrayList<String>())
                    .useAdaGrad(false)
                    .negativeSample(5)
                    .seed(42)
                    .windowSize(5)
                    .iterate(sentences)
                    .tokenizerFactory(tokenizer)
                    .build();

    model.fit();

    VocabCache originalVocab = model.getVocab();

    File serialized = File.createTempFile("temp", "w2v");
    serialized.deleteOnExit();

    WordVectorSerializer.writeWordVectors(model, serialized);

    WordVectors restored = WordVectorSerializer.loadTxtVectors(serialized);
    VocabCache restoredVocab = restored.vocab();

    // Document counts must survive serialization.
    assertEquals(originalVocab.totalNumberOfDocs(), restoredVocab.totalNumberOfDocs());

    // Every word vector must be reproduced exactly after the round trip.
    for (VocabWord word : model.getVocab().vocabWords()) {
        INDArray before = model.getWordVectorMatrix(word.getLabel());
        INDArray after = restored.getWordVectorMatrix(word.getLabel());

        assertEquals(before, after);
    }
}
 
Example 8
Source File: Word2VecTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Ignore
@Test
public void testSparkW2VonBiggerCorpus() throws Exception {
    // Spark-based word2vec smoke test on a larger corpus. Ignored by default:
    // it needs a local Spark master, lots of memory, and writes to /ext/Temp.
    SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest")
            .set("spark.driver.host", "localhost")
                    .set("spark.driver.maxResultSize", "4g").set("spark.driver.memory", "8g")
                    .set("spark.executor.memory", "8g");

    // Set SparkContext
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    Word2Vec word2Vec;
    try {
        // Path of the bundled test corpus (one sentence per line).
        String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();

        // Read in data
        JavaRDD<String> corpus = sc.textFile(dataPath);

        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new LowCasePreProcessor());

        word2Vec = new Word2Vec.Builder().setNGrams(1)
                        .tokenizerFactory(t).seed(42L).negative(3).useAdaGrad(false).layerSize(100).windowSize(5)
                        .learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5)
                        .useUnknown(true).build();

        word2Vec.train(corpus);
    } finally {
        // Fix: the original only stopped the context on the success path; a
        // failure during training leaked the local Spark context. Stop it
        // unconditionally so repeated runs do not accumulate contexts.
        sc.stop();
    }

    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), "/ext/Temp/sparkRuModel.txt");
}
 
Example 9
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Test
public void testRunWord2Vec() throws Exception {
    // End-to-end word2vec training on the test corpus, followed by sanity
    // checks on nearest neighbours, similarity, and batched vector lookup.
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    // Strip white space before and after for each line
    /*val shakespear = new ClassPathResource("big/rnj.txt");
    SentenceIterator iter = new BasicLineIterator(shakespear.getFile());*/
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());


    // SkipGram training with a fixed seed for reproducible assertions below.
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).batchSize(8192).layerSize(100)
                    .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
                    .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
                    //.negativeSample(10)
                    .epochs(1).windowSize(5).allowParallelTokenization(true)
                    .workers(6)
                    .usePreciseMode(true)
                    .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();
    File tempFile = File.createTempFile("temp", "temp");
    tempFile.deleteOnExit();

    // Persist the full model; the write must not disturb the in-memory model
    // used by the assertions that follow.
    WordVectorSerializer.writeFullModel(vec, tempFile.getAbsolutePath());
    Collection<String> lst = vec.wordsNearest("day", 10);
    //log.info(Arrays.toString(lst.toArray()));
    printWords("day", lst, vec);

    assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    // "day" and "night" should be related but not identical.
    assertTrue(sim < 1.0);
    assertTrue(sim > 0.4);


    // Semantically close time words are expected among the neighbours.
    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));

    assertFalse(lst.contains(null));


    // Querying again must be deterministic: same neighbours as the first call.
    lst = vec.wordsNearest("day", 10);
    //log.info(Arrays.toString(lst.toArray()));
    printWords("day", lst, vec);

    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));

    // Clean up any cache left behind by a previous run.
    new File("cache.ser").delete();

    ArrayList<String> labels = new ArrayList<>();
    labels.add("day");
    labels.add("night");
    labels.add("week");

    // Batched lookup must return rows identical to the per-word lookups.
    INDArray matrix = vec.getWordVectors(labels);
    assertEquals(matrix.getRow(0, true), vec.getWordVectorMatrix("day"));
    assertEquals(matrix.getRow(1, true), vec.getWordVectorMatrix("night"));
    assertEquals(matrix.getRow(2, true), vec.getWordVectorMatrix("week"));

    WordVectorSerializer.writeWordVectors(vec, pathToWriteto);
}
 
Example 10
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testOutputStream() throws Exception {
    // Verifies the OutputStream overload of writeWordVectors: vectors written
    // to a stream must load back identically, and the full-model round trip
    // must succeed as well.
    File file = File.createTempFile("tmp_ser", "ssa");
    file.deleteOnExit();

    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0)
                    .cache(cache).lr(0.025f).build();

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5)
                    .vocabCache(cache).seed(42)
                    //                .workers(6)
                    .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();

    INDArray day1 = vec.getWordVectorMatrix("day");

    // Fix: the original leaked the FileOutputStream; close it deterministically
    // with try-with-resources so the file is fully flushed before it is read.
    try (FileOutputStream fos = new FileOutputStream(file)) {
        WordVectorSerializer.writeWordVectors(vec, fos);
    }

    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);

    INDArray day2 = vec2.getWordVectorMatrix("day");

    // The vector for "day" must survive the stream round trip unchanged.
    assertEquals(day1, day2);

    File tempFile = File.createTempFile("tetsts", "Fdfs");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWord2VecModel(vec, tempFile);

    // Reading the full model back must not throw.
    Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile);
}
 
Example 11
Source File: Word2VecTest.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Test
public void testConcepts() throws Exception {
    // Trains word2vec on Spark, checks semantic neighbourhoods and stop-word
    // filtering, then verifies text serialization preserves the vocab index
    // and vectors.
    // These are all default values for word2vec
    SparkConf sparkConf = new SparkConf().setMaster("local[8]")
            .set("spark.driver.host", "localhost")
            .setAppName("sparktest");

    // Set SparkContext
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Path of data part-00000
    String dataPath = new ClassPathResource("big/raw_sentences.txt").getFile().getAbsolutePath();
    //        dataPath = "/ext/Temp/part-00000";
    //        String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();

    // Read in data
    JavaRDD<String> corpus = sc.textFile(dataPath);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // "three" is declared a stop word so we can assert below that it never
    // appears among the neighbours of "two".
    Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1)
                    //     .setTokenizer("org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory")
                    //     .setTokenPreprocessor("org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor")
                    //     .setRemoveStop(false)
                    .tokenizerFactory(t).seed(42L).negative(10).useAdaGrad(false).layerSize(150).windowSize(5)
                    .learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5)
                    .stopWords(Arrays.asList("three")).useUnknown(true).build();

    word2Vec.train(corpus);

    //word2Vec.setModelUtils(new FlatModelUtils());

    // useUnknown(true) above means an UNK vector should exist.
    System.out.println("UNK: " + word2Vec.getWordVectorMatrix("UNK"));

    InMemoryLookupTable<VocabWord> table = (InMemoryLookupTable<VocabWord>) word2Vec.lookupTable();

    double sim = word2Vec.similarity("day", "night");
    System.out.println("day/night similarity: " + sim);
    /*
    System.out.println("Hornjo: " + word2Vec.getWordVectorMatrix("hornjoserbsce"));
    System.out.println("carro: " + word2Vec.getWordVectorMatrix("carro"));
    
    Collection<String> portu = word2Vec.wordsNearest("carro", 10);
    printWords("carro", portu, word2Vec);
    
    portu = word2Vec.wordsNearest("davi", 10);
    printWords("davi", portu, word2Vec);
    
    System.out.println("---------------------------------------");
    */

    // Time words should cluster together.
    Collection<String> words = word2Vec.wordsNearest("day", 10);
    printWords("day", words, word2Vec);

    assertTrue(words.contains("night"));
    assertTrue(words.contains("week"));
    assertTrue(words.contains("year"));

    sim = word2Vec.similarity("two", "four");
    System.out.println("two/four similarity: " + sim);

    words = word2Vec.wordsNearest("two", 10);
    printWords("two", words, word2Vec);

    // three should be absent due to stopWords
    assertFalse(words.contains("three"));

    // Other small numbers should still be neighbours of "two".
    assertTrue(words.contains("five"));
    assertTrue(words.contains("four"));

    sc.stop();


    // test serialization
    File tempFile = testDir.newFile("temp" + System.currentTimeMillis() + ".tmp");

    // Capture index, vector, and first vocab element before serialization.
    int idx1 = word2Vec.vocab().wordFor("day").getIndex();

    INDArray array1 = word2Vec.getWordVectorMatrix("day").dup();

    VocabWord word1 = word2Vec.vocab().elementAtIndex(0);

    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), tempFile);

    WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile);

    VocabWord word2 = ((VocabCache<VocabWord>) vectors.vocab()).elementAtIndex(0);
    VocabWord wordIT = ((VocabCache<VocabWord>) vectors.vocab()).wordFor("it");
    int idx2 = vectors.vocab().wordFor("day").getIndex();

    INDArray array2 = vectors.getWordVectorMatrix("day").dup();

    System.out.println("word 'i': " + word2);
    System.out.println("word 'it': " + wordIT);

    // Index, first element, and vector for "day" must all survive the round trip.
    assertEquals(idx1, idx2);
    assertEquals(word1, word2);
    assertEquals(array1, array2);
}
 
Example 12
Source File: UITest.java    From deeplearning4j with Apache License 2.0 3 votes vote down vote up
@Test
public void testPosting() throws Exception {

    //        File inputFile = Resources.asFile("big/raw_sentences.txt");
    File inputFile = new ClassPathResource("/basic/word2vec_advance.txt").getFile();
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).epochs(1).layerSize(20)
                    .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5)
                    .iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    File tempFile = File.createTempFile("temp", "w2v");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWordVectors(vec, tempFile);

    WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile);

    UIServer.getInstance(); //Initialize

    UiConnectionInfo uiConnectionInfo =
                    new UiConnectionInfo.Builder().setAddress("localhost").setPort(9000).build();

    BarnesHutTsne tsne = new BarnesHutTsne.Builder().normalize(false).setFinalMomentum(0.8f).numDimension(2)
                    .setMaxIter(10).build();

    vectors.lookupTable().plotVocab(tsne, vectors.lookupTable().getVocabCache().numWords(), uiConnectionInfo);


    Thread.sleep(100000);
}