org.deeplearning4j.models.word2vec.Word2Vec Java Examples

The following examples show how to use org.deeplearning4j.models.word2vec.Word2Vec. They are drawn from open-source projects; the source file and license for each example are noted in its heading.
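Before the project examples, here is a minimal sketch of the typical Word2Vec workflow (train on a plain-text corpus, then query the model). The corpus path is a placeholder, and the import paths and hyperparameters mirror the deeplearning4j examples below; treat this as an illustrative starting point rather than a canonical recipe.

import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class Word2VecQuickStart {
    public static void main(String[] args) throws Exception {
        // One sentence per line; the path is a placeholder for your own corpus
        SentenceIterator iter = new BasicLineIterator("/path/to/raw_sentences.txt");

        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        // Illustrative hyperparameters, taken from the examples below
        Word2Vec vec = new Word2Vec.Builder()
                .minWordFrequency(5)
                .iterations(1)
                .layerSize(100)
                .seed(42)
                .windowSize(5)
                .iterate(iter)
                .tokenizerFactory(t)
                .build();

        vec.fit();

        // Query the trained model
        System.out.println(vec.wordsNearest("day", 10));
        System.out.println(vec.similarity("day", "night"));
    }
}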
Example #1
Source File: ManualTests.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testWord2VecPlot() throws Exception {
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
                    .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
                    .tokenizerFactory(t).build();

    vec.fit();

    //        UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();

    //        vec.getLookupTable().plotVocab(100, connectionInfo);

    // Keep the JVM alive so the (commented-out) UI plot above could be inspected manually
    Thread.sleep(10000000000L);
    fail("Not implemented");
}
 
Example #2
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
public static Word2Vec readAsBinaryNoLineBreaks(@NonNull InputStream inputStream) {
    boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive();
    int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency();

    // try to load without linebreaks
    try {
        if (originalPeriodic) {
            Nd4j.getMemoryManager().togglePeriodicGc(true);
        }

        Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);

        return readBinaryModel(inputStream, false, false);
    } catch (Exception readModelException) {
        log.error("Cannot read binary model", readModelException);
        throw new RuntimeException("Unable to guess input file format. Please use corresponding loader directly");
    }
}
 
Example #3
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * This method loads a Word2Vec model from a binary input stream.
 *
 * @param inputStream  binary input stream
 * @return Word2Vec
 */
public static Word2Vec readAsBinary(@NonNull InputStream inputStream) {
    boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive();
    int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency();

    // we fall back to trying the binary model instead
    try {
        log.debug("Trying binary model restoration...");

        if (originalPeriodic) {
            Nd4j.getMemoryManager().togglePeriodicGc(true);
        }

        Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);

        return readBinaryModel(inputStream, true, false);
    } catch (Exception readModelException) {
        throw new RuntimeException(readModelException);
    }
}
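A minimal usage sketch for this loader; the .bin path is a placeholder and the enclosing method is assumed to handle or declare the checked exceptions.

// Hypothetical usage: the path is a placeholder for a word2vec binary model
try (InputStream inputStream = new FileInputStream(new File("/path/to/model.bin"))) {
    Word2Vec model = WordVectorSerializer.readAsBinary(inputStream);
    System.out.println(model.wordsNearest("day", 10));
}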
 
Example #4
Source File: ChineseTokenizerTest.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
    SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");

    log.info("load is right!");
    TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
    //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());

    // Generates word vectors from the dataset stored in the resources folder
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
                    .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build();
    vec.fit();
    WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));

    // Trains a model that can find names in news text (.txt files), using the generated word vectors
    // WordVectors wordVectors;

    // Tests the model: can it find names in previously unseen text?

}
 
Example #5
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * This method loads a Word2Vec model from an input stream.
 *
 * @param stream InputStream to read the model from
 * @param readExtendedTable whether to read the extended lookup table
 * @return Word2Vec
 */
public static Word2Vec readWord2Vec(
            @NonNull InputStream stream,
            boolean readExtendedTable) throws IOException {
    SequenceVectors<VocabWord> vectors = readSequenceVectors(stream, readExtendedTable);

    Word2Vec word2Vec = new Word2Vec
            .Builder(vectors.getConfiguration())
            .layerSize(vectors.getLayerSize())
            .build();
    word2Vec.setVocab(vectors.getVocab());
    word2Vec.setLookupTable(vectors.lookupTable());
    word2Vec.setModelUtils(vectors.getModelUtils());

    return word2Vec;
}
 
Example #6
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore
public void testWriteWordVectorsFromWord2Vec() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile, true);
    WordVectorSerializer.writeWordVectors((Word2Vec) vec, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    INDArray wordVector1 = wordVectors.getWordVectorMatrix("Morgan_Freeman");
    INDArray wordVector2 = wordVectors.getWordVectorMatrix("JA_Montalbano");
    assertEquals(vec.getWordVectorMatrix("Morgan_Freeman"), wordVector1);
    assertEquals(vec.getWordVectorMatrix("JA_Montalbano"), wordVector2);
    assertTrue(wordVector1.length() == 300);
    assertTrue(wordVector2.length() == 300);
    assertEquals(wordVector1.getDouble(0), 0.044423, 1e-3);
    assertEquals(wordVector2.getDouble(0), 0.051964, 1e-3);
}
 
Example #7
Source File: Word2VecIteratorTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testLabeledExample() throws Exception {

    INDArray unk = vec.getWordVectorMatrix(Word2Vec.DEFAULT_UNK);
    assertNotEquals(null, unk);

    unk = vec.getWordVectorMatrix("2131241sdasdas");
    assertNotEquals(null, unk);

    ClassPathResource resource = new ClassPathResource("/labeled/");
    File dir = testDir.newFolder();
    resource.copyDirectory(dir);

    Word2VecDataSetIterator iter = new Word2VecDataSetIterator(vec,
                    new LabelAwareFileSentenceIterator(null, dir),
                    Arrays.asList("negative", "positive", "neutral"));
    DataSet next = iter.next();

}
 
Example #8
Source File: Word2VecIteratorTest.java    From deeplearning4j with Apache License 2.0
@Before
public void before() throws Exception {
    if (vec == null) {
        ClassPathResource resource = new ClassPathResource("/labeled/");
        File dir = testDir.newFolder();
        resource.copyDirectory(dir);
        SentenceIterator iter = UimaSentenceIterator.createWithPath(dir.getAbsolutePath());
        new File("cache.ser").delete();

        TokenizerFactory t = new UimaTokenizerFactory();

        vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).layerSize(100)
                        .stopWords(new ArrayList<String>()).useUnknown(true).windowSize(5).iterate(iter)
                        .tokenizerFactory(t).build();
        vec.fit();

    }
}
 
Example #9
Source File: VectorsConfigurationTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testFromW2V() throws Exception {
    VectorsConfiguration configuration = new VectorsConfiguration();
    configuration.setHugeModelExpected(true);
    configuration.setWindow(5);
    configuration.setIterations(3);
    configuration.setLayersSize(200);
    configuration.setLearningRate(1.4d);
    configuration.setSampling(0.0005d);
    configuration.setMinLearningRate(0.25d);
    configuration.setEpochs(1);

    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());

    Word2Vec vec = new Word2Vec.Builder(configuration).iterate(iter).build();

    VectorsConfiguration configuration2 = vec.getConfiguration();

    assertEquals(configuration, configuration2);
}
 
Example #10
Source File: PerformanceTests.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testWord2VecCBOWBig() throws Exception {
    SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt");
    //iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt");
    //SentenceIterator iter = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt");

    TokenizerFactory t = new KoreanTokenizerFactory();
    //t = new DefaultTokenizerFactory();
    //t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
                    .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
                    .allowParallelTokenization(true).tokenizerFactory(t)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    long time1 = System.currentTimeMillis();

    vec.fit();

    long time2 = System.currentTimeMillis();

    log.info("Total execution time: {}", (time2 - time1));
}
 
Example #11
Source File: Word2VecCN.java    From word2vec with Apache License 2.0
public Word2Vec fit() {
  log.info("Building model....");
  Word2Vec vec =
      new Word2Vec.Builder()
          .minWordFrequency(minWordFrequency)
          .iterations(iterations)
          .layerSize(layerSize)
          .seed(seed)
          .windowSize(windowSize)
          .iterate(sentenceIterator)
          .tokenizerFactory(tokenizerFactory)
          .build();

  log.info("Fitting Word2Vec model....");
  vec.fit();
  return vec;
}
 
Example #12
Source File: WindowConverter.java    From deeplearning4j with Apache License 2.0
/**
 * Converts a window (each word in the window)
 * into a vector.
 *
 * Keep in mind each window is a multi-word context.
 *
 * From there, each word uses the passed-in model
 * as a lookup table to get the vectors relevant
 * to the passed-in window.
 * @param window the window to take in
 * @param vec the model to use as a lookup table
 * @param normalize whether to use the normalized word vectors
 * @return a concatenated 1-row array
 * containing all of the numbers for each word in the window
 */
public static INDArray asExampleArray(Window window, Word2Vec vec, boolean normalize) {
    int length = vec.lookupTable().layerSize();
    List<String> words = window.getWords();
    int windowSize = vec.getWindow();
    Preconditions.checkState(words.size() == vec.getWindow());
    INDArray ret = Nd4j.create(1, length * windowSize);



    for (int i = 0; i < words.size(); i++) {
        String word = words.get(i);
        INDArray n = normalize ? vec.getWordVectorMatrixNormalized(word) : vec.getWordVectorMatrix(word);
        ret.put(new INDArrayIndex[] {NDArrayIndex.interval(i * vec.lookupTable().layerSize(),
                        i * vec.lookupTable().layerSize() + vec.lookupTable().layerSize())}, n);
    }

    return ret;
}
 
Example #13
Source File: FastTextTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testWordsStatistics() throws IOException {
    File output = testDir.newFile();

    FastText fastText = FastText
            .builder()
            .supervised(true)
            .inputFile(inputFile.getAbsolutePath())
            .outputFile(output.getAbsolutePath())
            .build();

    log.info("\nTraining supervised model ...\n");
    fastText.fit();

    File file = new File(output.getAbsolutePath() + ".vec");
    Word2Vec word2Vec = WordVectorSerializer.readAsCsv(file);

    assertEquals(48, word2Vec.getVocab().numWords());
    assertEquals("", 0.1667751520872116, word2Vec.similarity("Football", "teams"), 2e-3);
    assertEquals("", 0.10083991289138794, word2Vec.similarity("professional", "minutes"), 2e-3);
    assertEquals("", Double.NaN, word2Vec.similarity("java","cpp"), 0.0);
    assertThat(word2Vec.wordsNearest("association", 3), hasItems("Football", "Soccer", "men's"));
}
 
Example #14
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * This method just loads the full compressed model.
 */
private static Word2Vec readAsExtendedModel(@NonNull File file) throws IOException {
    int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency();
    boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive();

    log.debug("Trying full model restoration...");

    if (originalPeriodic) {
        Nd4j.getMemoryManager().togglePeriodicGc(true);
    }

    Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);

    return readWord2Vec(file);
}
 
Example #15
Source File: GoogleNewsVectorExample.java    From Java-Deep-Learning-Cookbook with MIT License
public static void main(String[] args) {
    try{
        File file = new File("{PATH-TO-GOOGLE-WORD-VECTOR}");
        Word2Vec model = WordVectorSerializer.readWord2VecModel(file);
        System.out.println(Arrays.asList(model.wordsNearest("season",10)));
    } catch(ND4JIllegalStateException e){
        System.out.println("Please provide proper directory path in place of: PATH-TO-GOOGLE-WORD-VECTOR");
    }
}
 
Example #16
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * This method loads a previously saved Word2Vec model from one of the following formats:
 * 1) binary model, either compressed or not, like the well-known Google News model
 * 2) popular CSV word2vec text format
 * 3) DL4J compressed format
 * <p>
 * Please note: if extended data isn't available, only weights will be loaded instead.
 *
 * @param file  model file
 * @param extendedModel  if TRUE, we'll try to load HS states & Huffman tree info; if FALSE, only weights will be loaded
 * @return word2vec model
 */
public static Word2Vec readWord2VecModel(File file, boolean extendedModel) {
    if (!file.exists() || !file.isFile()) {
        throw new ND4JIllegalStateException("File [" + file.getAbsolutePath() + "] doesn't exist");
    }

    boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive();
    if (originalPeriodic) {
        Nd4j.getMemoryManager().togglePeriodicGc(false);
    }
    Nd4j.getMemoryManager().setOccasionalGcFrequency(50000);

    try {
        return readWord2Vec(file, extendedModel);
    } catch (Exception readSequenceVectors) {
        try {
            return extendedModel
                    ? readAsExtendedModel(file)
                    : readAsSimplifiedModel(file);
        } catch (Exception loadFromFileException) {
            try {
                return readAsCsv(file);
            } catch (Exception readCsvException) {
                try {
                    return readAsBinary(file);
                } catch (Exception readBinaryException) {
                    try {
                        return readAsBinaryNoLineBreaks(file);
                    } catch (Exception readModelException) {
                        log.error("Unable to guess input file format", readModelException);
                        throw new RuntimeException("Unable to guess input file format. Please use corresponding loader directly");
                    }
                }
            }
        }
    }
}
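A minimal usage sketch of this loader, relying on the format auto-detection shown above; the model path is a placeholder.

// Hypothetical usage: the path is a placeholder for any supported model format
File modelFile = new File("/path/to/GoogleNews-vectors-negative300.bin.gz");
Word2Vec w2v = WordVectorSerializer.readWord2VecModel(modelFile, false); // weights only
System.out.println(w2v.wordsNearest("king", 10));
System.out.println(w2v.similarity("king", "queen"));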
 
Example #17
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
public static Word2Vec readAsBinaryNoLineBreaks(@NonNull File file) {
    try (InputStream inputStream = fileStream(file)) {
        return readAsBinaryNoLineBreaks(inputStream);
    } catch (IOException readCsvException) {
        throw new RuntimeException(readCsvException);
    }
}
 
Example #18
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * Writes the t-SNE format.
 *
 * @param vec  the word vectors to use for labeling
 * @param tsne the tsne array to write
 * @param csv  the file to write to
 * @throws Exception if writing fails
 */
public static void writeTsneFormat(Word2Vec vec, INDArray tsne, File csv) throws Exception {
    try (BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csv), StandardCharsets.UTF_8))) {
        int words = 0;
        InMemoryLookupCache l = (InMemoryLookupCache) vec.vocab();
        for (String word : vec.vocab().words()) {
            if (word == null) {
                continue;
            }
            StringBuilder sb = new StringBuilder();
            INDArray wordVector = tsne.getRow(l.wordFor(word).getIndex());
            for (int j = 0; j < wordVector.length(); j++) {
                sb.append(wordVector.getDouble(j));
                if (j < wordVector.length() - 1) {
                    sb.append(",");
                }
            }
            sb.append(",");
            sb.append(word.replaceAll(" ", WHITESPACE_REPLACEMENT));
            sb.append(" ");

            sb.append("\n");
            write.write(sb.toString());
            words++;
        }

        log.info("Wrote " + words + " with size of " + vec.lookupTable().layerSize());
    }
}
 
Example #19
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
public static Word2Vec readAsBinary(@NonNull File file) {
    try (InputStream inputStream = fileStream(file)) {
        return readAsBinary(inputStream);
    } catch (IOException readCsvException) {
        throw new RuntimeException(readCsvException);
    }
}
 
Example #20
Source File: Word2VecDataSetIteratorTest.java    From deeplearning4j with Apache License 2.0
/**
 * Basically all we want from this test - being able to finish without exceptions.
 */
@Test
public void testIterator1() throws Exception {

    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = ParagraphVectorsTest.getIterator(isIntegrationTests(), inputFile);
    // SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10) // we make sure we'll have some missing words
                    .iterations(1).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0)
                    .useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>())
                    .useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    vec.fit();

    List<String> labels = new ArrayList<>();
    labels.add("positive");
    labels.add("negative");

    Word2VecDataSetIterator iterator = new Word2VecDataSetIterator(vec, getLASI(iter, labels), labels, 1);
    INDArray array = iterator.next().getFeatures();
    int count = 0;
    while (iterator.hasNext()) {
        DataSet ds = iterator.next();

        assertArrayEquals(array.shape(), ds.getFeatures().shape());

        if (!isIntegrationTests() && count++ > 20)
            break; // raw_sentences.txt is 2.81 MB and takes quite some time to process; we only run the first 20 minibatches in unit tests
    }
}
 
Example #21
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
public static Word2Vec readAsCsv(@NonNull File file) {
    try (InputStream inputStream = fileStream(file)) {
        return readAsCsv(inputStream);
    } catch (IOException readCsvException) {
        throw new RuntimeException(readCsvException);
    }
}
 
Example #22
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testJSONSerialization() {
    ParagraphVectors paragraphVectors = new ParagraphVectors.Builder().build();
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    val words = new VocabWord[3];
    words[0] = new VocabWord(1.0, "word");
    words[1] = new VocabWord(2.0, "test");
    words[2] = new VocabWord(3.0, "tester");

    for (int i = 0; i < words.length; ++i) {
        cache.addToken(words[i]);
        cache.addWordToIndex(i, words[i].getLabel());
    }
    paragraphVectors.setVocab(cache);

    String json = null;
    Word2Vec unserialized = null;
    try {
        json = paragraphVectors.toJson();
        log.info("{}", json.toString());

        unserialized = ParagraphVectors.fromJson(json);
    } catch (Exception e) {
        log.error("",e);
        fail();
    }

    assertEquals(cache.totalWordOccurrences(), ((ParagraphVectors) unserialized).getVocab().totalWordOccurrences());
    assertEquals(cache.totalNumberOfDocs(), ((ParagraphVectors) unserialized).getVocab().totalNumberOfDocs());

    for (int i = 0; i < words.length; ++i) {
        val cached = cache.wordAtIndex(i);
        val restored = ((ParagraphVectors) unserialized).getVocab().wordAtIndex(i);
        assertNotNull(cached);
        assertEquals(cached, restored);
    }
}
 
Example #23
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * This method loads a Word2Vec model from a CSV (word2vec text format) input stream.
 *
 * @param inputStream  input stream
 * @return Word2Vec model
 */
public static Word2Vec readAsCsv(@NonNull InputStream inputStream) {
    VectorsConfiguration configuration = new VectorsConfiguration();

    // let's try to load this file as csv file
    try {
        log.debug("Trying CSV model restoration...");

        Pair<InMemoryLookupTable, VocabCache> pair = loadTxt(inputStream);
        Word2Vec.Builder builder = new Word2Vec
                .Builder()
                .lookupTable(pair.getFirst())
                .useAdaGrad(false)
                .vocabCache(pair.getSecond())
                .layerSize(pair.getFirst().layerSize())
                // we don't use hs here, because model is incomplete
                .useHierarchicSoftmax(false)
                .resetModel(false);

        TokenizerFactory factory = getTokenizerFactory(configuration);
        if (factory != null) {
            builder.tokenizerFactory(factory);
        }

        return builder.build();
    } catch (Exception ex) {
        throw new RuntimeException("Unable to load model in CSV format");
    }
}
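For reference, a short sketch of reading a text-format (.vec) file through the File overload shown in Example #21; the path is a placeholder.

// Hypothetical usage: the .vec path is a placeholder for a word2vec text-format file
Word2Vec csvModel = WordVectorSerializer.readAsCsv(new File("/path/to/vectors.vec"));
INDArray vector = csvModel.getWordVectorMatrix("day");
System.out.println(csvModel.wordsNearest("day", 5));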
 
Example #24
Source File: Word2VecDataFetcher.java    From deeplearning4j with Apache License 2.0
public Word2VecDataFetcher(String path, Word2Vec vec, List<String> labels) {
    if (vec == null || labels == null || labels.isEmpty())
        throw new IllegalArgumentException(
                        "Unable to initialize due to missing argument or empty label applyTransformToDestination");
    this.vec = vec;
    this.labels = labels;
    this.path = path;
}
 
Example #25
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * This method saves a Word2Vec model to an output stream.
 *
 * @param word2Vec Word2Vec model to save
 * @param stream OutputStream to write to
 */
public static void writeWord2Vec(@NonNull Word2Vec word2Vec, @NonNull OutputStream stream)
        throws IOException {

    SequenceVectors<VocabWord> vectors = new SequenceVectors.Builder<VocabWord>(word2Vec.getConfiguration())
            .layerSize(word2Vec.getLayerSize()).build();
    vectors.setVocab(word2Vec.getVocab());
    vectors.setLookupTable(word2Vec.getLookupTable());
    vectors.setModelUtils(word2Vec.getModelUtils());
    writeSequenceVectors(vectors, stream);
}
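A hedged round-trip sketch pairing this writer with the readWord2Vec loader from Example #5. It assumes an already fitted Word2Vec instance named trainedModel, a placeholder file name, and an enclosing method that declares throws IOException.

// Hypothetical round trip: serialize a trained model, then restore it
try (OutputStream out = new FileOutputStream("word2vec-model.zip")) {
    WordVectorSerializer.writeWord2Vec(trainedModel, out);
}
try (InputStream in = new FileInputStream("word2vec-model.zip")) {
    Word2Vec restored = WordVectorSerializer.readWord2Vec(in, false);
    System.out.println(restored.wordsNearest("day", 10));
}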
 
Example #26
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * This method loads a Word2Vec model from a file.
 *
 * @param file File to read the model from
 * @param readExtendedTables whether to read extended lookup tables
 * @return Word2Vec
 */
public static Word2Vec readWord2Vec(@NonNull File file, boolean readExtendedTables) {
    try (InputStream inputStream = fileStream(file)) {
        return readWord2Vec(inputStream, readExtendedTables);
    } catch (Exception readSequenceVectors) {
        throw new RuntimeException(readSequenceVectors);
    }
}
 
Example #27
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore //AB 2020/02/06 - https://github.com/eclipse/deeplearning4j/issues/8677
public void testDirectInference() throws Exception {
    boolean isIntegration = isIntegrationTests();
    File resource = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator sentencesIter = getIterator(isIntegration, resource);

    ClassPathResource resource_mixed = new ClassPathResource("paravec/");
    File local_resource_mixed = testDir.newFolder();
    resource_mixed.copyDirectory(local_resource_mixed);
    SentenceIterator iter = new AggregatingSentenceIterator.Builder()
                    .addSentenceIterator(sentencesIter)
                    .addSentenceIterator(new FileSentenceIterator(local_resource_mixed)).build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec wordVectors = new Word2Vec.Builder().minWordFrequency(1).batchSize(250).iterations(1).epochs(1)
                    .learningRate(0.025).layerSize(150).minLearningRate(0.001)
                    .elementsLearningAlgorithm(new SkipGram<VocabWord>()).useHierarchicSoftmax(true).windowSize(5)
                    .iterate(iter).tokenizerFactory(t).build();

    wordVectors.fit();

    ParagraphVectors pv = new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10)
                    .useHierarchicSoftmax(true).trainWordVectors(true).useExistingWordVectors(wordVectors)
                    .negativeSample(0).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
 
Example #28
Source File: WordConverter.java    From deeplearning4j with Apache License 2.0
public static INDArray toInputMatrix(List<Window> windows, Word2Vec vec) {
    int columns = vec.lookupTable().layerSize() * vec.getWindow();
    int rows = windows.size();
    INDArray ret = Nd4j.create(rows, columns);
    for (int i = 0; i < rows; i++) {
        ret.putRow(i, WindowConverter.asExampleMatrix(windows.get(i), vec));
    }
    return ret;
}
 
Example #29
Source File: InMemoryLookupTable.java    From deeplearning4j with Apache License 2.0
/**
 * @param word the word to look up
 * @return the word's vector, the UNK vector if the word is not in the vocabulary, or null if neither is available
 */
@Override
public INDArray vector(String word) {
    if (word == null)
        return null;
    int idx = vocab.indexOf(word);
    if (idx < 0) {
        idx = vocab.indexOf(Word2Vec.DEFAULT_UNK);
        if (idx < 0)
            return null;
    }
    return syn0.getRow(idx, true);
}
 
Example #30
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * Writes the word vectors to the given BufferedWriter. Note that this assumes an in-memory cache.
 * The BufferedWriter can write to a local file, an HDFS file, or any other compatible Java target.
 *
 * @param vec    the word2vec model to write
 * @param writer the BufferedWriter all data should be written to
 * @deprecated Use {@link #writeWord2Vec(Word2Vec, OutputStream)}
 */
@Deprecated
public static void writeWordVectors(@NonNull Word2Vec vec, @NonNull BufferedWriter writer) throws IOException {
    int words = 0;

    String str = vec.getVocab().numWords() + " " + vec.getLayerSize() + " " + vec.getVocab().totalNumberOfDocs();
    log.debug("Saving header: {}", str);
    writer.write(str + "\n");

    for (String word : vec.vocab().words()) {
        if (word == null) {
            continue;
        }
        StringBuilder sb = new StringBuilder();
        sb.append(word.replaceAll(" ", WHITESPACE_REPLACEMENT));
        sb.append(" ");
        INDArray wordVector = vec.getWordVectorMatrix(word);
        for (int j = 0; j < wordVector.length(); j++) {
            sb.append(wordVector.getDouble(j));
            if (j < wordVector.length() - 1) {
                sb.append(" ");
            }
        }
        sb.append("\n");
        writer.write(sb.toString());
        words++;
    }

    try {
        writer.flush();
    } catch (Exception e) {
        // ignore flush failures
    }
    log.info("Wrote " + words + " with size of " + vec.lookupTable().layerSize());
}