org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable Java Examples

The following examples show how to use org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable, drawn from open source projects. The source file, project, and license are noted above each example.
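To orient the examples, here is a minimal, hedged sketch of what InMemoryLookupTable is: an in-memory WeightLookupTable implementation whose syn0 matrix holds one embedding row per vocabulary word. The builder options mirror those used in Examples #16 and #21 below; vocabCache stands in for a pre-built VocabCache<VocabWord> and is an assumption of this sketch.

// Hedged sketch: build a lookup table over an existing vocab cache,
// then initialize and inspect its syn0 weight matrix (one row per word).
InMemoryLookupTable<VocabWord> table = new InMemoryLookupTable.Builder<VocabWord>()
        .vectorLength(100)          // embedding dimensionality
        .useAdaGrad(false)
        .cache(vocabCache)          // assumed: a pre-built VocabCache<VocabWord>
        .build();
table.resetWeights();               // allocate and randomly initialize syn0
INDArray syn0 = table.getSyn0();
System.out.println("vocab rows: " + syn0.rows());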
Example #1
Source File: TSNEVisualizationExample.java    From Java-Deep-Learning-Cookbook with MIT License
public static void main(String[] args) throws IOException {
    Nd4j.setDataType(DataBuffer.Type.DOUBLE);
    List<String> cacheList = new ArrayList<>();
    File file = new File("words.txt");
    String outputFile = "tsne-standard-coords.csv";

    // Load the word vectors, then split them into the vocabulary and the syn0 weight matrix
    Pair<InMemoryLookupTable,VocabCache> vectors = WordVectorSerializer.loadTxt(file);
    VocabCache cache = vectors.getSecond();
    INDArray weights = vectors.getFirst().getSyn0();

    // Collect the words in vocabulary order, so labels line up with rows of the weight matrix
    for (int i = 0; i < cache.numWords(); i++) {
        cacheList.add(cache.wordAtIndex(i));
    }

    BarnesHutTsne tsne = new BarnesHutTsne.Builder()
                                            .setMaxIter(100)
                                            .theta(0.5)
                                            .normalize(false)
                                            .learningRate(500)
                                            .useAdaGrad(false)
                                            .build();

    tsne.fit(weights);
    tsne.saveAsFile(cacheList,outputFile);

}
 
Example #2
Source File: DM.java    From deeplearning4j with Apache License 2.0
@Override
public void configure(@NonNull VocabCache<T> vocabCache, @NonNull WeightLookupTable<T> lookupTable,
                @NonNull VectorsConfiguration configuration) {
    this.vocabCache = vocabCache;
    this.lookupTable = lookupTable;
    this.configuration = configuration;

    cbow.configure(vocabCache, lookupTable, configuration);

    this.window = configuration.getWindow();
    this.useAdaGrad = configuration.isUseAdaGrad();
    this.negative = configuration.getNegative();
    this.sampling = configuration.getSampling();

    // Cache direct references to the lookup table's internal arrays for fast access during training
    this.syn0 = ((InMemoryLookupTable<T>) lookupTable).getSyn0();
    this.syn1 = ((InMemoryLookupTable<T>) lookupTable).getSyn1();
    this.syn1Neg = ((InMemoryLookupTable<T>) lookupTable).getSyn1Neg();
    this.expTable = ((InMemoryLookupTable<T>) lookupTable).getExpTable();
    this.table = ((InMemoryLookupTable<T>) lookupTable).getTable();
}
 
Example #3
Source File: BasicModelUtils.java    From deeplearning4j with Apache License 2.0
protected INDArray adjustRank(INDArray words) {
    if (lookupTable instanceof InMemoryLookupTable) {
        InMemoryLookupTable l = (InMemoryLookupTable) lookupTable;

        INDArray syn0 = l.getSyn0();
        // Match the lookup table's dtype before any downstream comparison
        if (!words.dataType().equals(syn0.dataType())) {
            return words.castTo(syn0.dataType());
        }
        if (words.rank() == 0 || words.rank() > 2) {
            throw new IllegalStateException("Invalid rank for wordsNearest method");
        } else if (words.rank() == 1) {
            return words.reshape(1, -1);
        }
    }
    return words;
}
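For reference, the rank-1 branch above is just a row-vector reshape. A hedged illustration of the intended shapes:

// A rank-1 vector of length n becomes a 1 x n row matrix, the shape the
// wordsNearest code path expects downstream.
INDArray v = Nd4j.rand(DataType.FLOAT, 100);   // rank 1, shape [100]
INDArray row = v.reshape(1, -1);               // rank 2, shape [1, 100]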
 
Example #4
Source File: WordVectorsImpl.java    From deeplearning4j with Apache License 2.0
/**
 * This method returns a 2D array, where each row is the vector for the corresponding label
 *
 * @param labels labels (words) to look up
 * @return a 2D array of word vectors, one row per resolved label
 */
@Override
public INDArray getWordVectors(@NonNull Collection<String> labels) {
    int[] indexes = new int[labels.size()];
    int cnt = 0;
    boolean useIndexUnknown = useUnknown && vocab.containsWord(getUNK());

    for (String label : labels) {
        if (vocab.containsWord(label)) {
            indexes[cnt] = vocab.indexOf(label);
        } else
            indexes[cnt] = useIndexUnknown ? vocab.indexOf(getUNK()) : -1;
        cnt++;
    }

    // Drop indexes for labels that were not found and had no UNK fallback
    while (ArrayUtils.contains(indexes, -1)) {
        indexes = ArrayUtils.removeElement(indexes, -1);
    }
    if (indexes.length == 0) {
            return Nd4j.empty(((InMemoryLookupTable)lookupTable).getSyn0().dataType());
    }

    INDArray result = Nd4j.pullRows(lookupTable.getWeights(), 1, indexes);
    return result;
}
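A hedged usage sketch for the method above; wordVectors and the vocabulary contents are assumptions:

// Rows come back in input order; labels missing from the vocab are dropped
// (or mapped to UNK when useUnknown is enabled and UNK is in the vocab).
INDArray batch = wordVectors.getWordVectors(java.util.Arrays.asList("day", "night", "notaword"));
System.out.println(java.util.Arrays.toString(batch.shape()));   // e.g. [2, layerSize]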
 
Example #5
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
/**
 * This method tests CSV file loading via the unified loader
 *
 * @throws Exception
 */
@Test
public void testUnifiedLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(textFile, true);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);

    // we're requesting the EXTENDED model, but the file has no syn1/Huffman info, so it should silently degrade to the simplified model
    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
 
Example #6
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testUnifiedLoaderArchive2() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, true);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);

    assertNotEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
 
Example #7
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testUnifiedLoaderArchive1() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, false);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);

    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1Neg());
}
 
Example #8
Source File: Word2VecParam.java    From deeplearning4j with Apache License 2.0
public Word2VecParam(boolean useAdaGrad, double negative, int numWords, INDArray table, int window,
                AtomicLong nextRandom, double alpha, double minAlpha, int totalWords, int lastChecked,
                Broadcast<AtomicLong> wordCount, InMemoryLookupTable weights, int vectorLength,
                Broadcast<double[]> expTable) {
    this.useAdaGrad = useAdaGrad;
    this.negative = negative;
    this.numWords = numWords;
    this.table = table;
    this.window = window;
    this.nextRandom = nextRandom;
    this.alpha = alpha;
    this.minAlpha = minAlpha;
    this.totalWords = totalWords;
    this.lastChecked = lastChecked;
    this.wordCount = wordCount;
    this.weights = weights;
    this.vectorLength = vectorLength;
    this.expTable = expTable;
}
 
Example #9
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore
public void testFromTableAndVocab() throws IOException {

    WordVectors vec = WordVectorSerializer.readWord2VecModel(textFile);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();

    WordVectors wordVectors = WordVectorSerializer.fromTableAndVocab(lookupTable, lookupCache);
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
    assertTrue(wordVector1.length == 300);
    assertTrue(wordVector2.length == 300);
    assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
    assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
 
Example #10
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore
public void testWriteWordVectors() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();
    WordVectorSerializer.writeWordVectors(lookupTable, lookupCache, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
    assertTrue(wordVector1.length == 300);
    assertTrue(wordVector2.length == 300);
    assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
    assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
 
Example #11
Source File: SequenceVectors.java    From deeplearning4j with Apache License 2.0
private void initIntersectVectors() {
    if (intersectModel != null && intersectModel.vocab().numWords() > 0) {
        List<Integer> indexes = new ArrayList<>();
        for (int i = 0; i < intersectModel.vocab().numWords(); ++i) {
            String externalWord = intersectModel.vocab().wordAtIndex(i);
            int index = this.vocab.indexOf(externalWord);
            if (index >= 0) {
                this.vocab.wordFor(externalWord).setLocked(lockFactor);
                indexes.add(index);
            }
        }

        if (indexes.size() > 0) {
            int[] intersectIndexes = Ints.toArray(indexes);

            // Overwrite the matching rows of this model's syn0 with rows from the intersect model
            Nd4j.scatterUpdate(org.nd4j.linalg.api.ops.impl.scatter.ScatterUpdate.UpdateOp.ASSIGN,
                    ((InMemoryLookupTable<VocabWord>) lookupTable).getSyn0(),
                    Nd4j.createFromArray(intersectIndexes),
                    ((InMemoryLookupTable<VocabWord>) intersectModel.lookupTable()).getSyn0(),
                    1);
        }
    }
}
 
Example #12
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * Writes the word vectors to the given path. Note that this assumes an in-memory cache.
 *
 * @param lookupTable lookup table holding the weights
 * @param cache       in-memory vocab cache backing the table
 * @param path        the path to write to
 * @throws IOException
 * @deprecated Use {@link #writeWord2VecModel(Word2Vec, File)} instead
 */
@Deprecated
public static void writeWordVectors(InMemoryLookupTable lookupTable, InMemoryLookupCache cache, String path)
        throws IOException {
    try (BufferedWriter write = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(path, false), StandardCharsets.UTF_8))) {
        for (int i = 0; i < lookupTable.getSyn0().rows(); i++) {
            String word = cache.wordAtIndex(i);
            if (word == null) {
                continue;
            }
            StringBuilder sb = new StringBuilder();
            sb.append(word.replaceAll(" ", WHITESPACE_REPLACEMENT));
            sb.append(" ");
            INDArray wordVector = lookupTable.vector(word);
            for (int j = 0; j < wordVector.length(); j++) {
                sb.append(wordVector.getDouble(j));
                if (j < wordVector.length() - 1) {
                    sb.append(" ");
                }
            }
            sb.append("\n");
            write.write(sb.toString());

        }
    }
}
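The writer above emits the classic text format, one line per word: the label followed by its space-separated vector components. A hedged round-trip sketch (the output path is hypothetical):

// Write with the deprecated writer above, then read back with the text loader
WordVectorSerializer.writeWordVectors(lookupTable, cache, "/tmp/vectors.txt");
WordVectors restored = WordVectorSerializer.loadTxtVectors(new File("/tmp/vectors.txt"));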
 
Example #13
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * Load word vectors from the given pair
 *
 * @param pair the given pair
 * @return a read-only word vectors implementation based on the given lookup table and vocab
 */
public static Word2Vec fromPair(Pair<InMemoryLookupTable, VocabCache> pair) {
    Word2Vec vectors = new Word2Vec();
    vectors.setLookupTable(pair.getFirst());
    vectors.setVocab(pair.getSecond());
    vectors.setModelUtils(new BasicModelUtils());
    return vectors;
}
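Combined with loadTxt (Example #15), this gives a compact way to promote a text-format model into a queryable Word2Vec instance. A hedged sketch, assuming a words.txt file and that "day" is in the vocabulary:

Pair<InMemoryLookupTable, VocabCache> pair = WordVectorSerializer.loadTxt(new File("words.txt"));
Word2Vec w2v = WordVectorSerializer.fromPair(pair);
System.out.println(w2v.wordsNearest("day", 5));   // query the promoted model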
 
Example #14
Source File: ParagraphVectors.java    From deeplearning4j with Apache License 2.0
/**
 * This method allows you to use a pre-built WordVectors model (e.g. Word2Vec) for ParagraphVectors.
 * The existing model will be transferred into the new model before training starts.
 *
 * PLEASE NOTE: a non-normalized model is recommended here.
 *
 * @param vec existing WordVectors model
 * @return this Builder
 */
@Override
@SuppressWarnings("unchecked")
public Builder useExistingWordVectors(@NonNull WordVectors vec) {
    if (((InMemoryLookupTable<VocabWord>) vec.lookupTable()).getSyn1() == null
                    && ((InMemoryLookupTable<VocabWord>) vec.lookupTable()).getSyn1Neg() == null)
        throw new ND4JIllegalStateException("Model being passed as existing has no syn1/syn1Neg available");

    this.existingVectors = vec;
    return this;
}
 
Example #15
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
public static Pair<InMemoryLookupTable, VocabCache> loadTxt(@NonNull File file) {
    try (InputStream inputStream = fileStream(file)) {
        return loadTxt(inputStream);
    } catch (IOException readTestException) {
        throw new RuntimeException(readTestException);
    }
}
 
Example #16
Source File: Word2VecRawTextExample.java    From Java-Data-Science-Cookbook with MIT License
public static void main(String[] args) throws Exception {

    // Path to the raw text corpus
    String filePath = "c:/raw_sentences.txt";

    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after each line
    SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
    // Split on white space within each line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache();
    WeightLookupTable table = new InMemoryLookupTable.Builder()
            .vectorLength(100)
            .useAdaGrad(false)
            .cache(cache)
            .lr(0.025f).build();

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5).iterations(1)
            .layerSize(100).lookupTable(table)
            .stopWords(new ArrayList<String>())
            .vocabCache(cache).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    log.info("Writing word vectors to text file....");
    // Write word vectors to file
    WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");

    log.info("Closest Words:");
    Collection<String> lst = vec.wordsNearest("man", 5);
    System.out.println(lst);
    double cosSim = vec.similarity("cruise", "voyage");
    System.out.println(cosSim);
}
 
Example #17
Source File: CBOW.java    From deeplearning4j with Apache License 2.0
@Override
public void configure(@NonNull VocabCache<T> vocabCache, @NonNull WeightLookupTable<T> lookupTable,
                @NonNull VectorsConfiguration configuration) {
    this.vocabCache = vocabCache;
    this.lookupTable = lookupTable;
    this.configuration = configuration;

    this.window = configuration.getWindow();
    this.useAdaGrad = configuration.isUseAdaGrad();
    this.negative = configuration.getNegative();
    this.sampling = configuration.getSampling();

    if (configuration.getNegative() > 0) {
        if (((InMemoryLookupTable<T>) lookupTable).getSyn1Neg() == null) {
            logger.info("Initializing syn1Neg...");
            ((InMemoryLookupTable<T>) lookupTable).setUseHS(configuration.isUseHierarchicSoftmax());
            ((InMemoryLookupTable<T>) lookupTable).setNegative(configuration.getNegative());
            ((InMemoryLookupTable<T>) lookupTable).resetWeights(false);
        }
    }


    // Wrap the lookup table's arrays in DeviceLocalNDArray so each device keeps a local copy
    this.syn0 = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn0());
    this.syn1 = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn1());
    this.syn1Neg = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn1Neg());
    //this.expTable = new DeviceLocalNDArray(Nd4j.create(((InMemoryLookupTable<T>) lookupTable).getExpTable()));
    this.expTable = new DeviceLocalNDArray(Nd4j.create(((InMemoryLookupTable<T>) lookupTable).getExpTable(),
            new long[]{((InMemoryLookupTable<T>) lookupTable).getExpTable().length}, syn0.get().dataType()));
    this.table = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getTable());
    this.variableWindows = configuration.getVariableWindows();
}
 
Example #18
Source File: SkipGram.java    From deeplearning4j with Apache License 2.0
/**
 * SkipGram initialization over the given vocabulary and WeightLookupTable
 *
 * @param vocabCache    vocabulary to train over
 * @param lookupTable   lookup table holding the weights
 * @param configuration training configuration
 */
@Override
public void configure(@NonNull VocabCache<T> vocabCache, @NonNull WeightLookupTable<T> lookupTable,
                @NonNull VectorsConfiguration configuration) {
    this.vocabCache = vocabCache;
    this.lookupTable = lookupTable;
    this.configuration = configuration;

    if (configuration.getNegative() > 0) {
        if (((InMemoryLookupTable<T>) lookupTable).getSyn1Neg() == null) {
            log.info("Initializing syn1Neg...");
            ((InMemoryLookupTable<T>) lookupTable).setUseHS(configuration.isUseHierarchicSoftmax());
            ((InMemoryLookupTable<T>) lookupTable).setNegative(configuration.getNegative());
            ((InMemoryLookupTable<T>) lookupTable).resetWeights(false);
        }
    }

    this.syn0 = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn0());
    this.syn1 = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn1());
    this.syn1Neg = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn1Neg());
    this.expTable = new DeviceLocalNDArray(Nd4j.create(((InMemoryLookupTable<T>) lookupTable).getExpTable(),
                                           new long[]{((InMemoryLookupTable<T>) lookupTable).getExpTable().length}, syn0.get().dataType()));
    this.table = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getTable());



    this.window = configuration.getWindow();
    this.useAdaGrad = configuration.isUseAdaGrad();
    this.negative = configuration.getNegative();
    this.sampling = configuration.getSampling();
    this.variableWindows = configuration.getVariableWindows();

    this.vectorLength = configuration.getLayersSize();
}
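The lazy syn1Neg initialization in CBOW and SkipGram only fires when negative sampling is enabled in the configuration. A hedged sketch of a configuration that would trigger it; the setter names assume the usual generated accessors on VectorsConfiguration, and skipGram, vocabCache, and lookupTable are placeholders:

VectorsConfiguration conf = new VectorsConfiguration();
conf.setNegative(5);                    // > 0 takes the syn1Neg branch above
conf.setUseHierarchicSoftmax(false);
conf.setWindow(5);
skipGram.configure(vocabCache, lookupTable, conf);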
 
Example #19
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * This method loads a Word2Vec model from a CSV (text) file
 *
 * @param inputStream  input stream
 * @return Word2Vec model
 */
public static Word2Vec readAsCsv(@NonNull InputStream inputStream) {
    VectorsConfiguration configuration = new VectorsConfiguration();

    // let's try to load this file as csv file
    try {
        log.debug("Trying CSV model restoration...");

        Pair<InMemoryLookupTable, VocabCache> pair = loadTxt(inputStream);
        Word2Vec.Builder builder = new Word2Vec
                .Builder()
                .lookupTable(pair.getFirst())
                .useAdaGrad(false)
                .vocabCache(pair.getSecond())
                .layerSize(pair.getFirst().layerSize())
                // we don't use hs here, because model is incomplete
                .useHierarchicSoftmax(false)
                .resetModel(false);

        TokenizerFactory factory = getTokenizerFactory(configuration);
        if (factory != null) {
            builder.tokenizerFactory(factory);
        }

        return builder.build();
    } catch (Exception ex) {
        // Preserve the original cause so the failure is diagnosable
        throw new RuntimeException("Unable to load model in CSV format", ex);
    }
}
 
Example #20
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * This method saves table of weights to file
 *
 * @param weightLookupTable WeightLookupTable
 * @param file File
 */
public static <T extends SequenceElement>  void writeLookupTable(WeightLookupTable<T> weightLookupTable,
                                                                 @NonNull File file) throws IOException {
    try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file),
                                                                            StandardCharsets.UTF_8))) {
        int numWords = weightLookupTable.getVocabCache().numWords();
        int layersSize = weightLookupTable.layerSize();
        long totalNumberOfDocs = weightLookupTable.getVocabCache().totalNumberOfDocs();

        String format = "%d %d %d\n";
        String header = String.format(format, numWords, layersSize, totalNumberOfDocs);

        writer.write(header);

        String row = "";
        for (int j = 0; j < weightLookupTable.getVocabCache().words().size(); ++j) {
            String label =  weightLookupTable.getVocabCache().wordAtIndex(j);
            row += label + " ";
            int freq = weightLookupTable.getVocabCache().wordFrequency(label);
            int rows = ((InMemoryLookupTable)weightLookupTable).getSyn0().rows();
            int cols = ((InMemoryLookupTable)weightLookupTable).getSyn0().columns();
            row += freq + " " + rows + " " + cols + " ";

            for (int r = 0; r < rows; ++r) {
                //row += " ";
                for (int c = 0; c < cols; ++c) {
                    row += ((InMemoryLookupTable) weightLookupTable).getSyn0().getDouble(r, c) + " ";
                }
                //row += " ";
            }
            row += "\n";
        }
        writer.write(row);
    }
}
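Example #24 below round-trips this format through readLookupTable; a hedged sketch of that pairing:

WordVectorSerializer.writeLookupTable(lookupTable, new File("lookupTable.txt"));
WeightLookupTable<VocabWord> restored = WordVectorSerializer.readLookupTable(new File("lookupTable.txt"));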
 
Example #21
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void ParaVec_Correct_WhenDeserialized() {

    INDArray syn0 = Nd4j.rand(DataType.FLOAT, 10, 2),
            syn1 = Nd4j.rand(DataType.FLOAT, 10, 2),
            syn1Neg = Nd4j.rand(DataType.FLOAT, 10, 2);

    InMemoryLookupTable<VocabWord> lookupTable = new InMemoryLookupTable
            .Builder<VocabWord>()
            .useAdaGrad(false)
            .cache(cache)
            .build();

    lookupTable.setSyn0(syn0);
    lookupTable.setSyn1(syn1);
    lookupTable.setSyn1Neg(syn1Neg);

    ParagraphVectors paragraphVectors = new ParagraphVectors.Builder()
            .vocabCache(cache)
            .lookupTable(lookupTable)
            .build();

    Word2Vec deser = null;
    try {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        WordVectorSerializer.writeWord2Vec(paragraphVectors, baos);
        byte[] bytesResult = baos.toByteArray();
        deser = WordVectorSerializer.readWord2Vec(new ByteArrayInputStream(bytesResult), true);
    } catch (Exception e) {
        log.error("",e);
        fail();
    }

    assertNotNull(paragraphVectors.getConfiguration());
    assertEquals(paragraphVectors.getConfiguration(), deser.getConfiguration());

    assertEquals(cache.totalWordOccurrences(),deser.vocab().totalWordOccurrences());
    assertEquals(cache.totalNumberOfDocs(), deser.vocab().totalNumberOfDocs());
    assertEquals(cache.numWords(), deser.vocab().numWords());

    for (int i = 0; i < cache.words().size(); ++i) {
        val cached = cache.wordAtIndex(i);
        val restored = deser.vocab().wordAtIndex(i);
        assertNotNull(cached);
        assertEquals(cached, restored);
    }

}
 
Example #22
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testParagraphVectorsDBOW() throws Exception {
    skipUnlessIntegrationTests();

    File file = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(file);

    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(5).seed(119).epochs(1)
                    .layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter)
                    .trainWordVectors(true).vocabCache(cache).tokenizerFactory(t).negativeSample(0)
                    .allowParallelTokenization(true).useHierarchicSoftmax(true).sampling(0).workers(4)
                    .usePreciseWeightInit(true).sequenceLearningAlgorithm(new DBOW<VocabWord>()).build();

    vec.fit();

    assertFalse(((InMemoryLookupTable<VocabWord>)vec.getLookupTable()).getSyn0().isAttached());
    assertFalse(((InMemoryLookupTable<VocabWord>)vec.getLookupTable()).getSyn1().isAttached());

    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");

    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    double simDN = vec.similarity("day", "night");
    log.info("day/night similariry: {}", simDN);

    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    //        assertTrue(similarity1 > 0.2d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    //      assertTrue(similarity2 > 0.2d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    //        assertTrue(similarity3 > 0.6d);

    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);


    // testing DM inference now

    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");
    INDArray inferredC1 = vec.inferVector("This is my day");
    INDArray inferredD1 = vec.inferVector("This is my night");

    log.info("A: {}", Arrays.toString(inferredA1.data().asFloat()));
    log.info("C: {}", Arrays.toString(inferredC1.data().asFloat()));

    assertNotEquals(inferredA1, inferredC1);

    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    double cosAC1 = Transforms.cosineSim(inferredA1.dup(), inferredC1.dup());
    double cosCD1 = Transforms.cosineSim(inferredD1.dup(), inferredC1.dup());

    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
    log.info("Cos A/C: {}", cosAC1);
    log.info("Cos C/D: {}", cosCD1);

}
 
Example #23
Source File: CBOW.java    From deeplearning4j with Apache License 2.0
public void iterateSample(T currentWord, int[] windowWords, boolean[] wordStatuses, AtomicLong nextRandom, double alpha,
                          boolean isInference, int numLabels, boolean trainWords, INDArray inferenceVector) {
    int[] idxSyn1 = null;
    byte[] codes = null;

    if (configuration.isUseHierarchicSoftmax()) {
        idxSyn1 = new int[currentWord.getCodeLength()];
        codes = new byte[currentWord.getCodeLength()];
        for (int p = 0; p < currentWord.getCodeLength(); p++) {
            if (currentWord.getPoints().get(p) < 0)
                continue;

            codes[p] = currentWord.getCodes().get(p);
            idxSyn1[p] = currentWord.getPoints().get(p);
        }
    } else {
        idxSyn1 = new int[0];
        codes = new byte[0];
    }


    if (negative > 0) {
        if (syn1Neg == null) {
            ((InMemoryLookupTable<T>) lookupTable).initNegative();
            syn1Neg = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn1Neg());
        }
    }

    if (batches.get() == null)
        batches.set(new ArrayList<Aggregate>());

    /*AggregateCBOW(syn0.get(), syn1.get(), syn1Neg.get(), expTable.get(), table.get(),
            currentWord.getIndex(), windowWords, idxSyn1, codes, (int) negative, currentWord.getIndex(),
            lookupTable.layerSize(), alpha, nextRandom.get(), vocabCache.numWords(), numLabels, trainWords,
            inferenceVector);*/

    boolean useHS = configuration.isUseHierarchicSoftmax();
    boolean useNegative = configuration.getNegative() > 0;

    int[] inputStatuses = new int[windowWords.length];
    for (int i = 0; i < windowWords.length; ++i) {
        if (i < wordStatuses.length)
            inputStatuses[i] = wordStatuses[i] ? 1 : 0;
        else
            inputStatuses[i] = -1;
    }
    INDArray wordsStatuses = Nd4j.createFromArray(inputStatuses);

    // One of the three CbowRound variants below is expected to be constructed;
    // at least one of hierarchic softmax / negative sampling must be enabled
    CbowRound cbow = null;

    if (useHS && useNegative) {
        cbow = new CbowRound(Nd4j.scalar(currentWord.getIndex()), Nd4j.createFromArray(windowWords),
                wordsStatuses,
                Nd4j.scalar(currentWord.getIndex()),
                syn0.get(), syn1.get(), syn1Neg.get(),
                expTable.get(), table.get(), Nd4j.createFromArray(idxSyn1), Nd4j.createFromArray(codes),
                (int)negative, Nd4j.scalar(alpha), Nd4j.scalar(nextRandom.get()),
                inferenceVector != null ? inferenceVector : Nd4j.empty(syn0.get().dataType()),
                Nd4j.empty(DataType.INT),
                trainWords,
                workers);
    }
    else if (useHS) {
        cbow = new CbowRound(currentWord.getIndex(), windowWords, wordsStatuses.toIntVector(),
                syn0.get(), syn1.get(),
                expTable.get(), idxSyn1, codes, alpha, nextRandom.get(),
                inferenceVector != null ? inferenceVector : Nd4j.empty(syn0.get().dataType()), 0);
    }
    else if (useNegative) {
        cbow = new CbowRound(currentWord.getIndex(), windowWords, wordsStatuses.toIntVector(), currentWord.getIndex(),
                syn0.get(), syn1Neg.get(),
                expTable.get(), table.get(), (int)negative, alpha, nextRandom.get(),
                inferenceVector != null ? inferenceVector : Nd4j.empty(syn0.get().dataType()), 0);
    }

    nextRandom.set(Math.abs(nextRandom.get() * 25214903917L + 11));
    Nd4j.getExecutioner().exec(cbow);

    /*if (!isInference) {
        batches.get().add(cbow);
        if (batches.get().size() > 4096) {
            Nd4j.getExecutioner().exec(batches.get());
            batches.get().clear();
        }
    } else
        Nd4j.getExecutioner().exec(cbow);*/

}
 
Example #24
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void weightLookupTable_Correct_WhenDeserialized() throws Exception {

    INDArray syn0 = Nd4j.rand(DataType.FLOAT, 10, 2),
            syn1 = Nd4j.rand(DataType.FLOAT, 10, 2),
            syn1Neg = Nd4j.rand(DataType.FLOAT, 10, 2);

    InMemoryLookupTable<VocabWord> lookupTable = new InMemoryLookupTable
            .Builder<VocabWord>()
            .useAdaGrad(false)
            .cache(cache)
            .build();

    lookupTable.setSyn0(syn0);
    lookupTable.setSyn1(syn1);
    lookupTable.setSyn1Neg(syn1Neg);

    File dir = testDir.newFolder();
    File file = new File(dir, "lookupTable.txt");

    WeightLookupTable<VocabWord> deser = null;
    try {
        WordVectorSerializer.writeLookupTable(lookupTable, file);
        deser = WordVectorSerializer.readLookupTable(file);
    } catch (Exception e) {
        log.error("",e);
        fail();
    }
    assertEquals(lookupTable.getVocab().totalWordOccurrences(), ((InMemoryLookupTable<VocabWord>)deser).getVocab().totalWordOccurrences());
    assertEquals(cache.totalNumberOfDocs(), ((InMemoryLookupTable<VocabWord>)deser).getVocab().totalNumberOfDocs());
    assertEquals(cache.numWords(), ((InMemoryLookupTable<VocabWord>)deser).getVocab().numWords());

    for (int i = 0; i < cache.words().size(); ++i) {
        val cached = cache.wordAtIndex(i);
        val restored = ((InMemoryLookupTable<VocabWord>)deser).getVocab().wordAtIndex(i);
        assertNotNull(cached);
        assertEquals(cached, restored);
    }

    assertEquals(lookupTable.getSyn0().columns(), ((InMemoryLookupTable<VocabWord>) deser).getSyn0().columns());
    assertEquals(lookupTable.getSyn0().rows(), ((InMemoryLookupTable<VocabWord>) deser).getSyn0().rows());
    for (int c = 0; c < ((InMemoryLookupTable<VocabWord>) deser).getSyn0().columns(); ++c) {
        for (int r = 0; r < ((InMemoryLookupTable<VocabWord>) deser).getSyn0().rows(); ++r) {
            assertEquals(lookupTable.getSyn0().getDouble(r,c),
                        ((InMemoryLookupTable<VocabWord>) deser).getSyn0().getDouble(r,c), 1e-5);
        }
    }
}
 
Example #25
Source File: TsneTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testSimple() throws Exception {
    //Simple sanity check

    for( int test=0; test <=1; test++){
        boolean syntheticData = test == 1;
        WorkspaceMode wsm = test == 0 ? WorkspaceMode.NONE : WorkspaceMode.ENABLED;
        log.info("Starting test: WSM={}, syntheticData={}", wsm, syntheticData);

        //STEP 1: Initialization
        int iterations = 50;
        //create an n-dimensional array of doubles
        Nd4j.setDefaultDataTypes(DataType.FLOAT, DataType.FLOAT);
        List<String> cacheList = new ArrayList<>(); //cacheList is a dynamic array of strings used to hold all words

        //STEP 2: Turn text input into a list of words
        INDArray weights;
        if(syntheticData){
            weights = Nd4j.rand(250, 200);
        } else {
            log.info("Load & Vectorize data....");
            File wordFile = new ClassPathResource("deeplearning4j-tsne/words.txt").getFile();   //Open the file
            //Get the data of all unique word vectors
            Pair<InMemoryLookupTable, VocabCache> vectors = WordVectorSerializer.loadTxt(wordFile);
            VocabCache cache = vectors.getSecond();
            weights = vectors.getFirst().getSyn0();    //separate the weights of unique words into their own list

            for (int i = 0; i < cache.numWords(); i++)   //separate the strings of words into their own list
                cacheList.add(cache.wordAtIndex(i));
        }

        //STEP 3: build a dual-tree tsne to use later
        log.info("Build model....");
        BarnesHutTsne tsne = new BarnesHutTsne.Builder()
                .setMaxIter(iterations)
                .theta(0.5)
                .normalize(false)
                .learningRate(500)
                .useAdaGrad(false)
                .workspaceMode(wsm)
                .build();


        //STEP 4: establish the tsne values and save them to a file
        log.info("Store TSNE Coordinates for Plotting....");
        File outDir = testDir.newFolder();
        tsne.fit(weights);
        tsne.saveAsFile(cacheList, new File(outDir, "out.txt").getAbsolutePath());
    }
}
 
Example #26
Source File: TsneTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testPerformance() throws Exception {

    StopWatch watch = new StopWatch();
    watch.start();
    for( int test=0; test <=1; test++){
        boolean syntheticData = test == 1;
        WorkspaceMode wsm = test == 0 ? WorkspaceMode.NONE : WorkspaceMode.ENABLED;
        log.info("Starting test: WSM={}, syntheticData={}", wsm, syntheticData);

        //STEP 1: Initialization
        int iterations = 50;
        //create an n-dimensional array of doubles
        Nd4j.setDefaultDataTypes(DataType.FLOAT, DataType.FLOAT);
        List<String> cacheList = new ArrayList<>(); //cacheList is a dynamic array of strings used to hold all words

        //STEP 2: Turn text input into a list of words
        INDArray weights;
        if(syntheticData){
            weights = Nd4j.rand(DataType.FLOAT, 250, 20);
        } else {
            log.info("Load & Vectorize data....");
            File wordFile = new ClassPathResource("deeplearning4j-tsne/words.txt").getFile();   //Open the file
            //Get the data of all unique word vectors
            Pair<InMemoryLookupTable, VocabCache> vectors = WordVectorSerializer.loadTxt(wordFile);
            VocabCache cache = vectors.getSecond();
            weights = vectors.getFirst().getSyn0();    //separate the weights of unique words into their own list

            for (int i = 0; i < cache.numWords(); i++)   //separate the strings of words into their own list
                cacheList.add(cache.wordAtIndex(i));
        }

        //STEP 3: build a dual-tree tsne to use later
        log.info("Build model....");
        BarnesHutTsne tsne = new BarnesHutTsne.Builder()
                .setMaxIter(iterations)
                .theta(0.5)
                .normalize(false)
                .learningRate(500)
                .useAdaGrad(false)
                .workspaceMode(wsm)
                .build();


        //STEP 4: establish the tsne values and save them to a file
        log.info("Store TSNE Coordinates for Plotting....");
        File outDir = testDir.newFolder();
        tsne.fit(weights);
        tsne.saveAsFile(cacheList, new File(outDir, "out.txt").getAbsolutePath());
    }
    watch.stop();
    System.out.println("Elapsed time : " + watch);
}
 
Example #27
Source File: Word2VecPerformerVoid.java    From deeplearning4j with Apache License 2.0
public Word2VecPerformerVoid(SparkConf sc, Broadcast<AtomicLong> wordCount, InMemoryLookupTable weights) {
    this.weights = weights;
    this.wordCount = wordCount;
    setup(sc);
}
 
Example #28
Source File: Word2VecPerformerVoid.java    From deeplearning4j with Apache License 2.0
public InMemoryLookupTable getWeights() {
    return weights;
}
 
Example #29
Source File: Word2VecPerformerVoid.java    From deeplearning4j with Apache License 2.0
public void setWeights(InMemoryLookupTable weights) {
    this.weights = weights;
}