org.deeplearning4j.models.embeddings.wordvectors.WordVectors Java Examples

The following examples show how to use org.deeplearning4j.models.embeddings.wordvectors.WordVectors. Each example comes from an open-source project; the source file, project, and license are noted above the code.
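
As a quick orientation before the examples, here is a minimal, self-contained sketch of the core WordVectors API calls that recur below: loading a saved model, looking up vectors, and querying similarity. The model path and the class name WordVectorsQuickStart are placeholders, not part of any project listed here.

import java.io.File;
import java.util.Collection;

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;
import org.nd4j.linalg.api.ndarray.INDArray;

public class WordVectorsQuickStart {
    public static void main(String[] args) {
        // Placeholder path: any previously trained word2vec model (text or binary)
        WordVectors vectors = WordVectorSerializer.readWord2VecModel(new File("path/to/vectors.bin"));

        // Raw vector and INDArray lookup for a single word
        double[] raw = vectors.getWordVector("night");
        INDArray matrix = vectors.getWordVectorMatrix("night");

        // Cosine similarity between two words and nearest neighbours of a word
        double sim = vectors.similarity("night", "day");
        Collection<String> nearest = vectors.wordsNearest("night", 10);

        System.out.println("dim=" + raw.length + ", first=" + matrix.getDouble(0)
                + ", sim=" + sim + ", nearest=" + nearest);
    }
}
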
Example #1
Source File: CnnSentenceDataSetIterator.java    From wekaDeeplearning4j with GNU General Public License v3.0
/**
 * Constructor that uses {@link Builder} extended with stopwords.
 *
 * @param builder Builder
 */
protected CnnSentenceDataSetIterator(CnnSentenceDataSetIterator.Builder builder) {
  super(builder);
  this.stopwords = builder.stopwords;
  setUnknownWordHandling(UnknownWordHandling.UseUnknownVector);

  // Set unknown word
  WordVectors wordVectors = getWordVectors();
  wordVectors.setUNK("UNKNOWN");

  // Initialize unknown word manually
  INDArray unknown;
  if (getUseNormalizedWordVectors()) {
    unknown = wordVectors.getWordVectorMatrixNormalized(wordVectors.getUNK());
  } else {
    unknown = wordVectors.getWordVectorMatrix(wordVectors.getUNK());
  }
  setUnknown(unknown);
}
 
Example #2
Source File: RnnTextEmbeddingDataSetIterator.java    From wekaDeeplearning4j with GNU General Public License v3.0
/**
 * Constructor with necessary objects to create RNN features.
 *
 * @param data Instances with documents and labels
 * @param wordVectors WordVectors object
 * @param tokenFact Tokenizer factory
 * @param tpp Token pre processor
 * @param stopWords Stop word object
 * @param sentenceProvider Labeled sentence provider for the documents
 * @param batchSize Size of each minibatch for training
 * @param truncateLength Maximum number of tokens per document; longer documents are truncated to this length
 */
public RnnTextEmbeddingDataSetIterator(
    Instances data,
    WordVectors wordVectors,
    TokenizerFactory tokenFact,
    TokenPreProcess tpp,
    AbstractStopwords stopWords,
    LabeledSentenceProvider sentenceProvider,
    int batchSize,
    int truncateLength) {
  this.batchSize = batchSize;
  this.wordVectorSize = wordVectors.getWordVector(wordVectors.vocab().wordAtIndex(0)).length;
  this.data = data;
  this.wordVectors = wordVectors;
  this.truncateLength = truncateLength;
  this.tokenizerFactory = tokenFact;
  this.tokenizerFactory.getBackend().setTokenPreProcessor(tpp.getBackend());
  this.stopWords = stopWords;
  this.sentenceProvider = sentenceProvider;
}
 
Example #3
Source File: CnnWord2VecSentenceClassificationExample.java    From Java-Deep-Learning-Cookbook with MIT License
private static DataSetIterator getDataSetIterator(boolean isTraining, WordVectors wordVectors, int minibatchSize,
                                                  int maxSentenceLength, Random rng ){
    String path = FilenameUtils.concat(DATA_PATH, (isTraining ? "aclImdb/train/" : "aclImdb/test/"));
    String positiveBaseDir = FilenameUtils.concat(path, "pos");
    String negativeBaseDir = FilenameUtils.concat(path, "neg");

    File filePositive = new File(positiveBaseDir);
    File fileNegative = new File(negativeBaseDir);

    Map<String,List<File>> reviewFilesMap = new HashMap<>();
    reviewFilesMap.put("Positive", Arrays.asList(filePositive.listFiles()));
    reviewFilesMap.put("Negative", Arrays.asList(fileNegative.listFiles()));

    LabeledSentenceProvider sentenceProvider = new FileLabeledSentenceProvider(reviewFilesMap, rng);

    return new CnnSentenceDataSetIterator.Builder(CnnSentenceDataSetIterator.Format.CNN2D)
            .sentenceProvider(sentenceProvider)
            .wordVectors(wordVectors)
            .minibatchSize(minibatchSize)
            .maxSentenceLength(maxSentenceLength)
            .useNormalizedWordVectors(false)
            .build();
}
 
Example #4
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore
public void testWriteWordVectors() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();
    WordVectorSerializer.writeWordVectors(lookupTable, lookupCache, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
    assertTrue(wordVector1.length == 300);
    assertTrue(wordVector2.length == 300);
    assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
    assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
 
Example #5
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore
public void testWriteWordVectorsFromWord2Vec() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile, true);
    WordVectorSerializer.writeWordVectors((Word2Vec) vec, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    INDArray wordVector1 = wordVectors.getWordVectorMatrix("Morgan_Freeman");
    INDArray wordVector2 = wordVectors.getWordVectorMatrix("JA_Montalbano");
    assertEquals(vec.getWordVectorMatrix("Morgan_Freeman"), wordVector1);
    assertEquals(vec.getWordVectorMatrix("JA_Montalbano"), wordVector2);
    assertTrue(wordVector1.length() == 300);
    assertTrue(wordVector2.length() == 300);
    assertEquals(wordVector1.getDouble(0), 0.044423, 1e-3);
    assertEquals(wordVector2.getDouble(0), 0.051964, 1e-3);
}
 
Example #6
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore
public void testFromTableAndVocab() throws IOException {

    WordVectors vec = WordVectorSerializer.readWord2VecModel(textFile);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();

    WordVectors wordVectors = WordVectorSerializer.fromTableAndVocab(lookupTable, lookupCache);
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
    assertTrue(wordVector1.length == 300);
    assertTrue(wordVector2.length == 300);
    assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
    assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
 
Example #7
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
/**
 * This method tests binary file loading as a static model
 *
 * @throws Exception
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderBinary() throws Exception {

    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.readWord2VecModel(binaryFile);
    WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(binaryFile);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}
 
Example #8
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
/**
 * This method tests ZIP file loading as a static model
 *
 * @throws Exception
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderArchive() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(w2v);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("night");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}
 
Example #9
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testUnifiedLoaderArchive1() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, false);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);

    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1Neg());
}
 
Example #10
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testUnifiedLoaderArchive2() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, true);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);

    assertNotEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
 
Example #11
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
/**
 * This method tests CSV file loading via the unified loader
 *
 * @throws Exception
 */
@Test
public void testUnifiedLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(textFile, true);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);

    // we request the EXTENDED model, but the file has no syn1/Huffman info, so it should silently degrade to the simplified model
    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
 
Example #12
Source File: Windows.java    From deeplearning4j with Apache License 2.0
/**
 * Constructs a list of windows of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @param vectors word vectors used to filter out tokens that have no known vector
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, @NonNull TokenizerFactory tokenizerFactory, int windowSize,
                WordVectors vectors) {
    Tokenizer tokenizer = tokenizerFactory.create(words);
    List<String> list = new ArrayList<>();
    while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();

        // if no UNK word is defined, we have to skip tokens that have no vector in the model
        if (vectors.getWordVectorMatrix(token) != null)
            list.add(token);
    }

    if (list.isEmpty())
        throw new IllegalStateException("No tokens found for windows");

    return windows(list, windowSize);
}
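
For context, a possible call site for the method above might look like the sketch below; the model path is a placeholder, and the org.deeplearning4j.text.movingwindow package for Window/Windows is assumed from the deeplearning4j-nlp layout.

import java.io.File;
import java.util.List;

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;
import org.deeplearning4j.text.movingwindow.Window;
import org.deeplearning4j.text.movingwindow.Windows;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class WindowsUsageSketch {
    public static void main(String[] args) {
        // Placeholder path: a model whose vocabulary covers the sentence below
        WordVectors vectors = WordVectorSerializer.readWord2VecModel(new File("path/to/vectors.bin"));
        TokenizerFactory t = new DefaultTokenizerFactory();

        // Tokenizes the sentence, drops tokens without a known vector, and builds padded windows of size 5
        List<Window> windows = Windows.windows("the quick brown fox jumps over the lazy dog", t, 5, vectors);
        System.out.println("windows: " + windows.size());
    }
}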
 
Example #13
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
    WordVectors googleVectors = WordVectorSerializer.readWord2VecModel(new File("/ext/GoogleNews-vectors-negative300.bin.gz"));

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    ParagraphVectors pv =
                    new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10).useHierarchicSoftmax(false)
                                    .trainWordVectors(false).iterations(10).useExistingWordVectors(googleVectors)
                                    .negativeSample(10).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
 
Example #14
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
/**
 * This method restores a previously saved word2vec model. The file can be in one of the following formats:
 * 1) Binary model, compressed or uncompressed, like the well-known Google model
 * 2) Popular CSV word2vec text format
 * 3) DL4j compressed format
 *
 * The result is a StaticWord2Vec model, which can be used as a lookup table only, e.g. in multi-GPU environments.
 *
 * @param inputStream InputStream pointing to a previously saved w2v model
 * @return the restored model, usable as a read-only lookup table
 */
public static WordVectors loadStaticModel(InputStream inputStream) throws IOException {

    File tmpFile = DL4JFileUtils.createTempFile("word2vec"+System.currentTimeMillis(), ".tmp");
    FileUtils.copyInputStreamToFile(inputStream, tmpFile);
    try {
        return loadStaticModel(tmpFile);
    } finally {
        tmpFile.delete();
    }

}
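
A typical call site, mirroring testStaticLoaderFromStream further down in this listing, could look like the following sketch; the file path is a placeholder.

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;
import org.nd4j.linalg.api.ndarray.INDArray;

public class StaticModelFromStreamSketch {
    public static void main(String[] args) throws IOException {
        // Placeholder path: any previously saved word2vec model, compressed or not
        try (InputStream is = new FileInputStream(new File("path/to/model.bin.gz"))) {
            WordVectors staticVectors = WordVectorSerializer.loadStaticModel(is);
            INDArray vector = staticVectors.getWordVectorMatrix("night");
            System.out.println("vector length: " + vector.length());
        }
    }
}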
 
Example #15
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
/**
 * This method tests binary file loading via the unified loader
 *
 * @throws Exception
 */
@Test
public void testUnifiedLoaderBinary() throws Exception {

    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.readWord2VecModel(binaryFile);
    WordVectors vectorsStatic = WordVectorSerializer.readWord2VecModel(binaryFile, false);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}
 
Example #16
Source File: FlatModelUtilsTest.java    From deeplearning4j with Apache License 2.0
private static void printWords(String target, Collection<String> list, WordVectors vec) {
    System.out.println("Words close to [" + target + "]:");
    for (String word : list) {
        double sim = vec.similarity(target, word);
        System.out.print("'" + word + "': [" + sim + "]");
    }
    System.out.print("\n");
}
 
Example #17
Source File: Word2VecTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore
public void testPortugeseW2V() throws Exception {
    WordVectors word2Vec = WordVectorSerializer.loadTxtVectors(new File("/ext/Temp/para.txt"));
    word2Vec.setModelUtils(new FlatModelUtils());

    Collection<String> portu = word2Vec.wordsNearest("carro", 10);
    printWords("carro", portu, word2Vec);

    portu = word2Vec.wordsNearest("davi", 10);
    printWords("davi", portu, word2Vec);
}
 
Example #18
Source File: Word2VecTest.java    From deeplearning4j with Apache License 2.0
private static void printWords(String target, Collection<String> list, WordVectors vec) {
    System.out.println("Words close to [" + target + "]:");
    for (String word : list) {
        double sim = vec.similarity(target, word);
        System.out.print("'" + word + "': [" + sim + "], ");
    }
    System.out.print("\n");
}
 
Example #19
Source File: ParagraphVectors.java    From deeplearning4j with Apache License 2.0
/**
 * This method allows you to use a pre-built WordVectors model (e.g. Word2Vec) for ParagraphVectors.
 * The existing model will be transferred into the new model before training starts.
 *
 * PLEASE NOTE: A non-normalized model is recommended here.
 *
 * @param vec existing WordVectors model
 * @return builder instance
 */
@Override
@SuppressWarnings("unchecked")
public Builder useExistingWordVectors(@NonNull WordVectors vec) {
    if (((InMemoryLookupTable<VocabWord>) vec.lookupTable()).getSyn1() == null
                    && ((InMemoryLookupTable<VocabWord>) vec.lookupTable()).getSyn1Neg() == null)
        throw new ND4JIllegalStateException("Model being passed as existing has no syn1/syn1Neg available");

    this.existingVectors = vec;
    return this;
}
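
As a usage sketch, patterned on testGoogleModelForInference earlier in this listing, an existing Word2Vec-style model can be wired into a ParagraphVectors builder roughly as follows; the model path is a placeholder and the builder options are kept minimal.

import java.io.File;

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;
import org.deeplearning4j.models.paragraphvectors.ParagraphVectors;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.nd4j.linalg.api.ndarray.INDArray;

public class ExistingVectorsSketch {
    public static void main(String[] args) {
        // Placeholder path: a pre-trained (ideally non-normalized) word2vec model
        WordVectors existing = WordVectorSerializer.readWord2VecModel(new File("path/to/vectors.bin"));

        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        // The existing vectors are transferred into the new model; note that the builder
        // requires syn1/syn1Neg to be available in the passed model
        ParagraphVectors pv = new ParagraphVectors.Builder()
                .tokenizerFactory(t)
                .trainWordVectors(false)
                .useExistingWordVectors(existing)
                .build();

        INDArray doc = pv.inferVector("This text is pretty awesome");
        System.out.println("inferred vector length: " + doc.length());
    }
}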
 
Example #20
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
/**
 * This method tests CSV file loading as a static model
 *
 * @throws Exception
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(textFile);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}
 
Example #21
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderFromStream() throws Exception {

    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.readWord2VecModel(binaryFile);
    WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(new FileInputStream(binaryFile));

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}
 
Example #22
Source File: CnnTextFilesEmbeddingInstanceIteratorTest.java    From wekaDeeplearning4j with GNU General Public License v3.0
public Instances makeData() throws Exception {
  final Instances data = TestUtil.makeTestDataset(42,
      100,
      0,
      0,
      1,
      0,
      0,
      1,
      Attribute.NUMERIC,
      1,
      false);

  WordVectors wordVectors = WordVectorSerializer
      .loadStaticModel(DatasetLoader.loadGoogleNewsVectors());
  String[] words = (String[]) wordVectors.vocab().words().toArray(new String[0]);

  Random rand = new Random(42);
  for (Instance inst : data) {
    StringBuilder sentence = new StringBuilder();
    for (int i = 0; i < 10; i++) {
      final int idx = rand.nextInt(words.length);
      sentence.append(" ").append(words[idx]);
    }
    inst.setValue(0, sentence.toString());
  }
  return data;
}
 
Example #23
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore
public void testLoader() throws Exception {
    WordVectors vec = WordVectorSerializer.loadTxtVectors(new File("/home/raver119/Downloads/_vectors.txt"));

    logger.info("Rewinding: " + Arrays.toString(vec.getWordVector("rewinding")));
}
 
Example #24
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testIndexPersistence() throws Exception {
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5)
                    .iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    VocabCache orig = vec.getVocab();

    File tempFile = File.createTempFile("temp", "w2v");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWordVectors(vec, tempFile);

    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(tempFile);

    VocabCache rest = vec2.vocab();

    assertEquals(orig.totalNumberOfDocs(), rest.totalNumberOfDocs());

    for (VocabWord word : vec.getVocab().vocabWords()) {
        INDArray array1 = vec.getWordVectorMatrix(word.getLabel());
        INDArray array2 = vec2.getWordVectorMatrix(word.getLabel());

        assertEquals(array1, array2);
    }
}
 
Example #25
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testLoaderBinary() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile);
    assertEquals(vec.vocab().numWords(), 30);
    assertTrue(vec.vocab().hasToken("Morgan_Freeman"));
    assertTrue(vec.vocab().hasToken("JA_Montalbano"));
    double[] wordVector1 = vec.getWordVector("Morgan_Freeman");
    double[] wordVector2 = vec.getWordVector("JA_Montalbano");
    assertTrue(wordVector1.length == 300);
    assertTrue(wordVector2.length == 300);
    assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
    assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
 
Example #26
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
/**
 * This method is only for testing against the real Google model, which is a few gigabytes in size.
 * Keep it ignored, since it requires the full Google model to be present on the system, about 1.6 GB compressed.
 *
 * @throws Exception
 */
@Test
@Ignore
public void testStaticLoaderGoogleModel() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    long time1 = System.currentTimeMillis();
    WordVectors vectors = WordVectorSerializer
                    .loadStaticModel(new File("C:\\Users\\raver\\develop\\GoogleNews-vectors-negative300.bin.gz"));
    long time2 = System.currentTimeMillis();

    logger.info("Loading time: {} ms", (time2 - time1));
}
 
Example #27
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testLoaderStream() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(textFile);

    assertEquals(vec.vocab().numWords(), 30);
    assertTrue(vec.vocab().hasToken("Morgan_Freeman"));
    assertTrue(vec.vocab().hasToken("JA_Montalbano"));
}