org.deeplearning4j.models.embeddings.loader.WordVectorSerializer Java Examples

The following examples show how to use org.deeplearning4j.models.embeddings.loader.WordVectorSerializer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test
@Ignore
public void testWriteWordVectorsFromWord2Vec() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile, true);
    WordVectorSerializer.writeWordVectors((Word2Vec) vec, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    INDArray wordVector1 = wordVectors.getWordVectorMatrix("Morgan_Freeman");
    INDArray wordVector2 = wordVectors.getWordVectorMatrix("JA_Montalbano");
    assertEquals(vec.getWordVectorMatrix("Morgan_Freeman"), wordVector1);
    assertEquals(vec.getWordVectorMatrix("JA_Montalbano"), wordVector2);
    assertTrue(wordVector1.length() == 300);
    assertTrue(wordVector2.length() == 300);
    assertEquals(wordVector1.getDouble(0), 0.044423, 1e-3);
    assertEquals(wordVector2.getDouble(0), 0.051964, 1e-3);
}
 
Example #2
Source File: TSNEVisualizationExample.java    From Java-Deep-Learning-Cookbook with MIT License 6 votes vote down vote up
public static void main(String[] args) throws IOException {
    Nd4j.setDataType(DataBuffer.Type.DOUBLE);
    List<String> cacheList = new ArrayList<>();
    File file = new File("words.txt");
    String outputFile = "tsne-standard-coords.csv";
    Pair<InMemoryLookupTable,VocabCache> vectors = WordVectorSerializer.loadTxt(file);
    VocabCache cache = vectors.getSecond();
    INDArray weights = vectors.getFirst().getSyn0();

    for(int i=0;i<cache.numWords();i++){
        cacheList.add(cache.wordAtIndex(i));
    }

    BarnesHutTsne tsne = new BarnesHutTsne.Builder()
                                            .setMaxIter(100)
                                            .theta(0.5)
                                            .normalize(false)
                                            .learningRate(500)
                                            .useAdaGrad(false)
                                            .build();

    tsne.fit(weights);
    tsne.saveAsFile(cacheList,outputFile);

}
 
Example #3
Source File: ChineseTokenizerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
    SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");

    log.info("load is right!");
    TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
    //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());

    //Generates a word-vector from the dataset stored in resources folder
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
                    .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build();
    vec.fit();
    WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));

    //trains a model that can find out all names from news(Suffix txt),It uses word vector generated
    // WordVectors wordVectors;

    //test model,Whether the model find out name from unknow text;

}
 
Example #4
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test
public void testBackwardsCompatibleSequenceVectors() {
    File model_v3 = Resources.asFile("deeplearning4j-nlp/seqv_beta3.csv");
    File model_v4 = Resources.asFile("deeplearning4j-nlp/seqv_beta4.csv");
    try {
        SequenceVectors vectors1 = WordVectorSerializer.readSequenceVectors(new VocabWordFactory(), model_v3);
        SequenceVectors vectors2 = WordVectorSerializer.readSequenceVectors(new VocabWordFactory(), model_v4);

        assertEquals(vectors1.vocab().numWords(), vectors2.vocab().numWords());
        for (int i = 0; i < vectors1.vocab().numWords(); ++i) {
            assertEquals(vectors1.vocab().words().toArray()[i], vectors2.vocab().words().toArray()[i]);
        }
    } catch (Exception e) {
        fail(e.getMessage());
    }
}
 
Example #5
Source File: FastTextTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test
public void testWordsStatistics() throws IOException {
    File output = testDir.newFile();

    FastText fastText = FastText
            .builder()
            .supervised(true)
            .inputFile(inputFile.getAbsolutePath())
            .outputFile(output.getAbsolutePath())
            .build();

    log.info("\nTraining supervised model ...\n");
    fastText.fit();

    File file = new File(output.getAbsolutePath() + ".vec");
    Word2Vec word2Vec = WordVectorSerializer.readAsCsv(file);

    assertEquals(48, word2Vec.getVocab().numWords());
    assertEquals("", 0.1667751520872116, word2Vec.similarity("Football", "teams"), 2e-3);
    assertEquals("", 0.10083991289138794, word2Vec.similarity("professional", "minutes"), 2e-3);
    assertEquals("", Double.NaN, word2Vec.similarity("java","cpp"), 0.0);
    assertThat(word2Vec.wordsNearest("association", 3), hasItems("Football", "Soccer", "men's"));
}
 
Example #6
Source File: TSNEVisualizationExample.java    From Java-Deep-Learning-Cookbook with MIT License 6 votes vote down vote up
public static void main(String[] args) throws IOException {
    Nd4j.setDataType(DataBuffer.Type.DOUBLE);
    List<String> cacheList = new ArrayList<>();
    File file = new File("words.txt");
    String outputFile = "tsne-standard-coords.csv";
    Pair<InMemoryLookupTable,VocabCache> vectors = WordVectorSerializer.loadTxt(file);
    VocabCache cache = vectors.getSecond();
    INDArray weights = vectors.getFirst().getSyn0();

    for(int i=0;i<cache.numWords();i++){
        cacheList.add(cache.wordAtIndex(i));
    }

    BarnesHutTsne tsne = new BarnesHutTsne.Builder()
                                            .setMaxIter(100)
                                            .theta(0.5)
                                            .normalize(false)
                                            .learningRate(500)
                                            .useAdaGrad(false)
                                            .build();

    tsne.fit(weights);
    tsne.saveAsFile(cacheList,outputFile);

}
 
Example #7
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
    WordVectors googleVectors = WordVectorSerializer.readWord2VecModel(new File("/ext/GoogleNews-vectors-negative300.bin.gz"));

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    ParagraphVectors pv =
                    new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10).useHierarchicSoftmax(false)
                                    .trainWordVectors(false).iterations(10).useExistingWordVectors(googleVectors)
                                    .negativeSample(10).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
 
Example #8
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * This method tests CSV file loading via unified loader
 *
 * @throws Exception
 */
@Test
public void testUnifiedLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(textFile, true);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);

    // we're trying EXTENDED model, but file doesn't have syn1/huffman info, so it should be silently degraded to simplified model
    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
 
Example #9
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test
public void testUnifiedLoaderArchive2() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, true);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);

    assertNotEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
 
Example #10
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test
public void testUnifiedLoaderArchive1() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, false);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);

    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1Neg());
}
 
Example #11
Source File: Word2VecDemo.java    From COMP6237 with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Override
public void run() {
	try {
		final File dataset = DataUtils.getDataLocation("GoogleNews-vectors-negative300-SLIM.bin.gz");

		if (!(dataset.exists())) {
			dataset.getParentFile().mkdirs();
			FileUtils.copyURLToFile(
					new URL("https://artist-cloud.ecs.soton.ac.uk/s/pTHe9m5PxZubnWB/download"),
					dataset);
			System.err.println("downloaded data");
		}

		vec = WordVectorSerializer.readWord2VecModel(dataset);
		System.err.println("loaded model");
	} catch (final IOException e) {
		System.err.println(e);
		e.printStackTrace();
		vec = null;
	}
}
 
Example #12
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * This method tests ZIP file loading as static model
 *
 * @throws Exception
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderArchive() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(w2v);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("night");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}
 
Example #13
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * This method tests binary file loading as static model
 *
 * @throws Exception
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderBinary() throws Exception {

    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.readWord2VecModel(binaryFile);
    WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(binaryFile);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}
 
Example #14
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    long time1 = System.currentTimeMillis();
    Word2Vec vec = WordVectorSerializer.readWord2VecModel(
                    new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
    long time2 = System.currentTimeMillis();
    log.info("Model loaded in {} msec", time2 - time1);
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    vec.setTokenizerFactory(t);
    vec.setSentenceIterator(iter);
    vec.getConfiguration().setUseHierarchicSoftmax(false);
    vec.getConfiguration().setNegative(5.0);
    vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());

    vec.fit();
}
 
Example #15
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test
@Ignore
public void testFromTableAndVocab() throws IOException {

    WordVectors vec = WordVectorSerializer.readWord2VecModel(textFile);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();

    WordVectors wordVectors = WordVectorSerializer.fromTableAndVocab(lookupTable, lookupCache);
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
    assertTrue(wordVector1.length == 300);
    assertTrue(wordVector2.length == 300);
    assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
    assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
 
Example #16
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test
@Ignore
public void testWriteWordVectors() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();
    WordVectorSerializer.writeWordVectors(lookupTable, lookupCache, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
    assertTrue(wordVector1.length == 300);
    assertTrue(wordVector2.length == 300);
    assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
    assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
 
Example #17
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testFastText_readWord2VecModel() {
    File[] files = { fastTextRaw, fastTextZip, fastTextGzip };
    for (File file : files) {
        try {
            Word2Vec word2Vec = WordVectorSerializer.readWord2VecModel(file);
            assertEquals(99, word2Vec.getVocab().numWords());
        } catch (Exception readCsvException) {
            fail("Failure for input file " + file.getAbsolutePath() + " " + readCsvException.getMessage());
        }
    }
}
 
Example #18
Source File: Word2VecTestsSmall.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test(timeout = 300000)
    public void testUnkSerialization_1() throws Exception {
        val inputFile = Resources.asFile("big/raw_sentences.txt");
//        val iter = new BasicLineIterator(inputFile);
        val iter = ParagraphVectorsTest.getIterator(isIntegrationTests(), inputFile);
        val t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        val vec = new Word2Vec.Builder()
                .minWordFrequency(1)
                .epochs(1)
                .layerSize(300)
                .limitVocabularySize(1) // Limit the vocab size to 2 words
                .windowSize(5)
                .allowParallelTokenization(true)
                .batchSize(512)
                .learningRate(0.025)
                .minLearningRate(0.0001)
                .negativeSample(0.0)
                .sampling(0.0)
                .useAdaGrad(false)
                .useHierarchicSoftmax(true)
                .iterations(1)
                .useUnknown(true) // Using UNK with limited vocab size causes the issue
                .seed(42)
                .iterate(iter)
                .workers(4)
                .tokenizerFactory(t).build();

        vec.fit();

        val tmpFile = File.createTempFile("temp","temp");
        tmpFile.deleteOnExit();

        WordVectorSerializer.writeWord2VecModel(vec, tmpFile); // NullPointerException was thrown here
    }
 
Example #19
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * This method tests binary file loading via unified loader
 *
 * @throws Exception
 */
@Test
public void testUnifiedLoaderBinary() throws Exception {

    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.readWord2VecModel(binaryFile);
    WordVectors vectorsStatic = WordVectorSerializer.readWord2VecModel(binaryFile, false);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}
 
Example #20
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testVocabPeristence() throws Exception {
    val vocabA = new AbstractCache.Builder<VocabWord>().build();

    vocabA.addToken(new VocabWord(3.0, "alpha"));
    vocabA.addWordToIndex(1, "alpha");

    vocabA.addToken(new VocabWord(4.0, "beta"));
    vocabA.addWordToIndex(0, "beta");

    val tmpFile = File.createTempFile("sdsds","sfdsfdsgsdf");
    tmpFile.deleteOnExit();

    vocabA.setTotalWordOccurences(200);
    vocabA.incrementTotalDocCount(100);

    assertEquals(100, vocabA.totalNumberOfDocs());
    assertEquals(200, vocabA.totalWordOccurrences());

    WordVectorSerializer.writeVocabCache(vocabA, tmpFile);

    val vocabB = WordVectorSerializer.readVocabCache(tmpFile);

    assertEquals(vocabA.wordAtIndex(0), vocabB.wordAtIndex(0));
    assertEquals(vocabA.wordAtIndex(1), vocabB.wordAtIndex(1));

    assertEquals(vocabA.numWords(), vocabB.numWords());
    assertEquals(vocabA.totalNumberOfDocs(), vocabB.totalNumberOfDocs());
    assertEquals(vocabA.totalWordOccurrences(), vocabB.totalWordOccurrences());
}
 
Example #21
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testB64_1() throws Exception {
    String wordA = "night";
    String wordB = "night day";
    String encA = WordVectorSerializer.ReadHelper.encodeB64(wordA);
    String encB = WordVectorSerializer.ReadHelper.encodeB64(wordB);

    assertEquals(wordA, WordVectorSerializer.ReadHelper.decodeB64(encA));
    assertEquals(wordB, WordVectorSerializer.ReadHelper.decodeB64(encB));

    assertEquals(wordA, WordVectorSerializer.ReadHelper.decodeB64(wordA));
    assertEquals(wordB, WordVectorSerializer.ReadHelper.decodeB64(wordB));

}
 
Example #22
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderFromStream() throws Exception {

    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.readWord2VecModel(binaryFile);
    WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(new FileInputStream(binaryFile));

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}
 
Example #23
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testFastText() {
    File[] files = { fastTextRaw, fastTextZip, fastTextGzip };
    for (File file : files) {
        try {
            Word2Vec word2Vec = WordVectorSerializer.readAsCsv(file);
            assertEquals(99, word2Vec.getVocab().numWords());
        } catch (Exception readCsvException) {
            fail("Failure for input file " + file.getAbsolutePath() + " " + readCsvException.getMessage());
        }
    }
}
 
Example #24
Source File: Word2VecModelExample.java    From Java-Deep-Learning-Cookbook with MIT License 5 votes vote down vote up
public static void main(String[] args) throws Exception {
    final SentenceIterator iterator = new LineSentenceIterator(new ClassPathResource("raw_sentences_large.txt").getFile());
    SentenceDataPreProcessor.setPreprocessor(iterator);
    final TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new EndingPreProcessor());

    final Word2Vec model = new Word2Vec.Builder()
                                    .iterate(iterator)
                                    .tokenizerFactory(tokenizerFactory)
                                    .minWordFrequency(5)
                                    .layerSize(100)
                                    .seed(42)
                                    .epochs(50)
                                    .windowSize(5)
                                    .build();
    log.info("Fitting Word2Vec model....");
    model.fit();

    final Collection<String> words = model.wordsNearest("season",10);
    for(final String word: words){
        System.out.println(word+ " ");
    }
    final double cosSimilarity = model.similarity("season","program");
    System.out.println(cosSimilarity);

    BarnesHutTsne tsne = new BarnesHutTsne.Builder()
            .setMaxIter(100)
            .theta(0.5)
            .normalize(false)
            .learningRate(500)
            .useAdaGrad(false)
            .build();


    //save word vectors for tSNE visualization.
    WordVectorSerializer.writeWordVectors(model.lookupTable(),new File("words.txt"));
    WordVectorSerializer.writeWord2VecModel(model, "model.zip");

}
 
Example #25
Source File: FlatModelUtilsTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Before
public void setUp() throws Exception {
    if (vec == null) {
        //vec = WordVectorSerializer.loadFullModel("/Users/raver119/develop/model.dat");
        vec = WordVectorSerializer.loadFullModel("/ext/Temp/Models/model.dat");
        //vec = WordVectorSerializer.loadFullModel("/ext/Temp/Models/raw_sentences.dat");
    }
}
 
Example #26
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * This method here is only to test real google model few gigabytes worth
 * Keep it ignored, since it requirs full google model being present in system, which is 1.6gb compressed
 *
 * @throws Exception
 */
@Test
@Ignore
public void testStaticLoaderGoogleModel() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    long time1 = System.currentTimeMillis();
    WordVectors vectors = WordVectorSerializer
                    .loadStaticModel(new File("C:\\Users\\raver\\develop\\GoogleNews-vectors-negative300.bin.gz"));
    long time2 = System.currentTimeMillis();

    logger.info("Loading time: {} ms", (time2 - time1));
}
 
Example #27
Source File: GoogleNewsVectorExample.java    From Java-Deep-Learning-Cookbook with MIT License 5 votes vote down vote up
public static void main(String[] args) {
    try{
        File file = new File("{PATH-TO-GOOGLE-WORD-VECTOR}");
        Word2Vec model = WordVectorSerializer.readWord2VecModel(file);
        System.out.println(Arrays.asList(model.wordsNearest("season",10)));
    } catch(ND4JIllegalStateException e){
        System.out.println("Please provide proper directory path in place of: PATH-TO-GOOGLE-WORD-VECTOR");
    }
}
 
Example #28
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
@Ignore
public void testLoader() throws Exception {
    WordVectors vec = WordVectorSerializer.loadTxtVectors(new File("/home/raver119/Downloads/_vectors.txt"));

    logger.info("Rewinding: " + Arrays.toString(vec.getWordVector("rewinding")));
}
 
Example #29
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testIndexPersistence() throws Exception {
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5)
                    .iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    VocabCache orig = vec.getVocab();

    File tempFile = File.createTempFile("temp", "w2v");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWordVectors(vec, tempFile);

    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(tempFile);

    VocabCache rest = vec2.vocab();

    assertEquals(orig.totalNumberOfDocs(), rest.totalNumberOfDocs());

    for (VocabWord word : vec.getVocab().vocabWords()) {
        INDArray array1 = vec.getWordVectorMatrix(word.getLabel());
        INDArray array2 = vec2.getWordVectorMatrix(word.getLabel());

        assertEquals(array1, array2);
    }
}
 
Example #30
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * This method tests CSV file loading as static model
 *
 * @throws Exception
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(textFile);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}