org.deeplearning4j.text.sentenceiterator.BasicLineIterator Java Examples

The following examples show how to use org.deeplearning4j.text.sentenceiterator.BasicLineIterator. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BasicLineIteratorExample.java    From Java-Deep-Learning-Cookbook with MIT License 6 votes vote down vote up
public static void main(String[] args) throws IOException {
    // First pass: iterate the classpath resource line-by-line and count sentences.
    SentenceIterator iterator = new BasicLineIterator(new ClassPathResource("raw_sentences.txt").getFile());
    int count = 0;
    for (; iterator.hasNext(); count++) {
        iterator.nextSentence();
    }
    System.out.println("count = " + count);

    // Second pass: rewind, install the default preprocessor, and print every sentence.
    iterator.reset();
    SentenceDataPreProcessor.setPreprocessor(iterator);
    while (iterator.hasNext()) {
        System.out.println(iterator.nextSentence());
    }
}
 
Example #2
Source File: ManualTests.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test(timeout = 300000)
public void testWord2VecPlot() throws Exception {
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // Train a small Word2Vec model over the raw-sentences corpus.
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
                    .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
                    .tokenizerFactory(t).build();

    vec.fit();

    //        UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
    //        vec.getLookupTable().plotVocab(100, connectionInfo);

    // FIX: the original called Thread.sleep(10000000000L) (~116 days) here, which
    // guaranteed the test died via the 300 s timeout instead of reaching fail().
    // The sleep only existed to keep a (commented-out) UI plot alive, so it is removed
    // and the test fails immediately with the intended message.
    fail("Not implemented");
}
 
Example #3
Source File: BasicLabelAwareIteratorTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test
public void testHasNextDocument1() throws Exception {
    // Wrap a plain line iterator so that each line becomes a labelled document.
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    BasicLabelAwareIterator iterator = new BasicLabelAwareIterator.Builder(iter).setLabelTemplate("DOCZ_").build();

    // Consume the whole corpus, counting documents.
    int cnt = 0;
    for (; iterator.hasNextDocument(); cnt++) {
        iterator.nextDocument();
    }
    assertEquals(97162, cnt);

    // One label per document, generated from the "DOCZ_" template.
    LabelsSource generator = iterator.getLabelsSource();
    assertEquals(97162, generator.getLabels().size());
    assertEquals("DOCZ_0", generator.getLabels().get(0));
}
 
Example #4
Source File: AsyncLabelAwareIteratorTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test(timeout = 300000)
public void nextDocument() throws Exception {
    // Baseline: count documents coming straight out of the label-aware iterator.
    SentenceIterator sentence = new BasicLineIterator(Resources.asFile("big/raw_sentences.txt"));
    BasicLabelAwareIterator backed = new BasicLabelAwareIterator.Builder(sentence).build();

    int cnt = 0;
    while (backed.hasNextDocument()) {
        backed.nextDocument();
        ++cnt;
    }
    assertEquals(97162, cnt);

    backed.reset();

    // Async wrapper with a 64-document buffer. A reset() after 10 reads replays
    // the stream from the start, so the final count is 97162 + 10 = 97172.
    AsyncLabelAwareIterator iterator = new AsyncLabelAwareIterator(backed, 64);
    cnt = 0;
    while (iterator.hasNext()) {
        iterator.next();
        ++cnt;
        if (cnt == 10) {
            iterator.reset();
        }
    }
    assertEquals(97172, cnt);
}
 
Example #5
Source File: ParallelTransformerIteratorTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test(timeout = 300000)
public void hasNext() throws Exception {
    // Multithreaded sentence -> sequence transformation over the big corpus.
    SentenceIterator iterator = new BasicLineIterator(Resources.asFile("big/raw_sentences.txt"));
    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iterator).allowMultithreading(true)
                    .tokenizerFactory(factory).build();

    Iterator<Sequence<VocabWord>> iter = transformer.iterator();
    int cnt = 0;
    while (iter.hasNext()) {
        Sequence<VocabWord> sequence = iter.next();
        // Every produced sequence must exist and contain at least one token.
        assertNotEquals("Failed on [" + cnt + "] iteration", null, sequence);
        assertNotEquals("Failed on [" + cnt + "] iteration", 0, sequence.size());
        ++cnt;
    }

    assertEquals(97162, cnt);
}
 
Example #6
Source File: VocabConstructorTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test(timeout=5000)		// 5s timeout
public void testParallelTokenizationDisabled_Completes() throws Exception {
    // Vocabulary construction must still terminate when parallel tokenization is off.
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile);

    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iter).tokenizerFactory(t).build();
    AbstractSequenceIterator<VocabWord> sequenceIterator =
            new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
            .addSource(sequenceIterator, 5)
            .allowParallelTokenization(false)
            .build();

    constructor.buildJointVocabulary(false, true);
}
 
Example #7
Source File: VocabConstructorTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Test
public void testVocab() throws Exception {
    // Tokenize the whole corpus and report total-token / line / unique-token counts.
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile);

    Set<String> set = new HashSet<>();
    int lines = 0;
    int cnt = 0;
    while (iter.hasNext()) {
        Tokenizer tok = t.create(iter.nextSentence());
        for (String token : tok.getTokens()) {
            // FIX: trim().isEmpty() already covers the plain isEmpty() case,
            // so the separate isEmpty() check was redundant.
            if (token == null || token.trim().isEmpty())
                continue;
            cnt++;

            // FIX: Set.add is a no-op for duplicates; no contains() pre-check needed.
            set.add(token);
        }

        lines++;
    }

    log.info("Total number of tokens: [" + cnt + "], lines: [" + lines + "], set size: [" + set.size() + "]");
    log.info("Set:\n" + set);
}
 
Example #8
Source File: ChineseTokenizerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
    // Build word vectors from a Chinese-name corpus using the Chinese tokenizer.
    SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");

    log.info("load is right!");
    TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
    //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());

    // Generate a word-vector model from the dataset stored in the resources folder.
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
                    .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build();
    vec.fit();
    WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));

    // Intended follow-up (not implemented): train a model that extracts names from
    // news text (.txt files) using the word vectors generated above, then verify
    // that it can find names in previously unseen text.
}
 
Example #9
Source File: BasicLineIteratorExample.java    From Java-Deep-Learning-Cookbook with MIT License 6 votes vote down vote up
public static void main(String[] args) throws IOException {
    SentenceIterator iterator = new BasicLineIterator(new ClassPathResource("raw_sentences.txt").getFile());

    // Count the sentences in the resource file.
    int count = 0;
    while (iterator.hasNext()) {
        iterator.nextSentence();
        ++count;
    }
    System.out.println("count = " + count);

    // Rewind, attach the preprocessor, and echo each sentence to stdout.
    iterator.reset();
    SentenceDataPreProcessor.setPreprocessor(iterator);
    while (iterator.hasNext()) {
        System.out.println(iterator.nextSentence());
    }
}
 
Example #10
Source File: PerformanceTests.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Ignore
@Test
public void testWord2VecCBOWBig() throws Exception {
    // NOTE(review): relies on a developer-local corpus path, hence the @Ignore;
    // only runnable manually on that machine.
    SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt");
    //iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt");
    //SentenceIterator iter = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt");

    TokenizerFactory t = new KoreanTokenizerFactory();
    //t = new DefaultTokenizerFactory();
    //t.setTokenPreProcessor(new CommonPreprocessor());

    // CBOW configuration with hierarchic softmax and parallel tokenization.
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
                    .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
                    .allowParallelTokenization(true).tokenizerFactory(t)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    // Time the fit so manual runs can compare performance.
    long start = System.currentTimeMillis();
    vec.fit();
    long stop = System.currentTimeMillis();

    log.info("Total execution time: {}", (stop - start));
}
 
Example #11
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    // Load the pretrained GoogleNews model (developer-local path), then continue
    // training it on this class's corpus. Manual-only, hence @Ignore.
    long start = System.currentTimeMillis();
    Word2Vec vec = WordVectorSerializer.readWord2VecModel(
                    new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
    log.info("Model loaded in {} msec", System.currentTimeMillis() - start);

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // Reconfigure the loaded model for negative-sampling CBOW uptraining.
    vec.setTokenizerFactory(t);
    vec.setSentenceIterator(iter);
    vec.getConfiguration().setUseHierarchicSoftmax(false);
    vec.getConfiguration().setNegative(5.0);
    vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());

    vec.fit();
}
 
Example #12
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testWordVectorsPartiallyAbsentLabels() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // FIX: the builder previously called useUnknown(true) and later useUnknown(false)
    // in the same chain; the last call wins, so the contradictory first call was dead
    // code and has been removed for clarity. Effective setting: useUnknown(false).
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10)
            .iterations(1).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new CBOW<VocabWord>()).epochs(1).windowSize(5)
            .useHierarchicSoftmax(true).allowParallelTokenization(true)
            .useUnknown(false)
            .modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    // "fewfew" is not in the vocabulary; with useUnknown(false) it is skipped, so
    // the returned matrix holds only the three known words, in request order.
    ArrayList<String> labels = new ArrayList<>();
    labels.add("fewfew");
    labels.add("day");
    labels.add("night");
    labels.add("week");

    INDArray matrix = vec.getWordVectors(labels);
    assertEquals(3, matrix.rows());
    assertEquals(matrix.getRow(0, true), vec.getWordVectorMatrix("day"));
    assertEquals(matrix.getRow(1, true), vec.getWordVectorMatrix("night"));
    assertEquals(matrix.getRow(2, true), vec.getWordVectorMatrix("week"));
}
 
Example #13
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testWordVectorsAbsentLabels() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // FIX: the builder previously called useUnknown(true) and later useUnknown(false)
    // in the same chain; the last call wins, so the contradictory first call was dead
    // code and has been removed for clarity. Effective setting: useUnknown(false).
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10)
            .iterations(1).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new CBOW<VocabWord>()).epochs(1).windowSize(5)
            .useHierarchicSoftmax(true).allowParallelTokenization(true)
            .useUnknown(false)
            .modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    // With useUnknown(false), a request containing only absent labels yields an
    // empty result matrix.
    ArrayList<String> labels = new ArrayList<>();
    labels.add("fewfew");

    INDArray matrix = vec.getWordVectors(labels);
    assertTrue(matrix.isEmpty());
}
 
Example #14
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testWordVectorsAbsentLabels_WithUnknown() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // FIX: the unknown element was labelled "UNKOWN" (typo) while the assertion
    // below looks up "UNKNOWN". Both now use the same spelling, so the lookup hits
    // the unknown element directly instead of relying on the useUnknown fallback
    // (which returned the same vector, masking the typo).
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).batchSize(8192).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
            //.negativeSample(10)
            .epochs(1).windowSize(5).allowParallelTokenization(true)
            .workers(4)
            .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t)
            .useUnknown(true).unknownElement(new VocabWord(1, "UNKNOWN")).build();

    vec.fit();

    // With useUnknown(true), every absent label maps to the shared unknown vector.
    ArrayList<String> labels = new ArrayList<>();
    labels.add("bus");
    labels.add("car");

    INDArray matrix = vec.getWordVectors(labels);
    for (int i = 0; i < labels.size(); ++i)
        assertEquals(matrix.getRow(i, true), vec.getWordVectorMatrix("UNKNOWN"));
}
 
Example #15
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testWordsNearestSum() throws IOException {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if (!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    log.info("Load & Vectorize Sentences....");
    SentenceIterator iter = new BasicLineIterator(inputFile);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    // Minimal configuration: library defaults everywhere except the basics below.
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();
    log.info("Writing word vectors to text file....");
    log.info("Closest Words:");

    // "day" should cluster with other time-related words under wordsNearestSum.
    Collection<String> lst = vec.wordsNearestSum("day", 10);
    log.info("10 Words closest to 'day': {}", lst);
    for (String expected : new String[] {"week", "night", "year", "years", "time"}) {
        assertTrue(lst.contains(expected));
    }
}
 
Example #16
Source File: BasicLabelAwareIteratorTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testHasNextDocument2() throws Exception {
    // Line iterator wrapped into a label-aware document iterator.
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    BasicLabelAwareIterator iterator = new BasicLabelAwareIterator.Builder(iter).setLabelTemplate("DOCZ_").build();

    // First full pass over the corpus.
    int cnt = 0;
    while (iterator.hasNextDocument()) {
        iterator.nextDocument();
        ++cnt;
    }
    assertEquals(97162, cnt);

    iterator.reset();

    // Second pass after reset must see exactly the same number of documents.
    cnt = 0;
    while (iterator.hasNextDocument()) {
        iterator.nextDocument();
        ++cnt;
    }
    assertEquals(97162, cnt);

    // this is important moment. Iterator after reset should not increase number of labels attained
    LabelsSource generator = iterator.getLabelsSource();
    assertEquals(97162, generator.getLabels().size());
    assertEquals("DOCZ_0", generator.getLabels().get(0));
}
 
Example #17
Source File: ParallelTransformerIteratorTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void orderIsStableForParallelTokenization() throws Exception {
    // 1000 numbered lines, one token per line; expected read-back order is 0..999.
    String[] stringsArray = new String[1000];
    // FIX: String += inside a 1000-iteration loop is O(n^2); use StringBuilder.
    StringBuilder testStrings = new StringBuilder();
    for (int i = 0; i < 1000; ++i) {
        stringsArray[i] = Integer.toString(i);
        testStrings.append(i).append('\n');
    }
    InputStream inputStream = IOUtils.toInputStream(testStrings.toString(), "UTF-8");
    SentenceIterator iterator = new BasicLineIterator(inputStream);

    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iterator).allowMultithreading(true)
            .tokenizerFactory(factory).build();

    Iterator<Sequence<VocabWord>> iter = transformer.iterator();

    // Even with multithreaded tokenization, sequences must come back in input order.
    Sequence<VocabWord> sequence = null;
    int cnt = 0;
    while (iter.hasNext()) {
        sequence = iter.next();
        List<VocabWord> words = sequence.getElements();
        for (VocabWord word : words) {
            assertEquals(stringsArray[cnt], word.getWord());
            ++cnt;
        }
    }

}
 
Example #18
Source File: FastTextTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testLoadIterator() throws FileNotFoundException {
    // A supervised FastText instance must accept and load a sentence iterator.
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    FastText fastText = FastText.builder()
            .supervised(true)
            .iterator(iter)
            .build();
    fastText.loadIterator();
}
 
Example #19
Source File: ParallelTransformerIteratorTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testCompletes_WhenIteratorHasOneElement() throws Exception {
    // Single input line containing 100 space-separated tokens "0".."99".
    String[] stringsArray = new String[100];
    // FIX: String += inside the loop is O(n^2); use StringBuilder.
    StringBuilder testString = new StringBuilder();
    for (int i = 0; i < 100; ++i) {
        testString.append(i).append(' ');
        stringsArray[i] = Integer.toString(i);
    }
    InputStream inputStream = IOUtils.toInputStream(testString.toString(), "UTF-8");
    SentenceIterator iterator = new BasicLineIterator(inputStream);

    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iterator).allowMultithreading(true)
            .tokenizerFactory(factory).build();

    Iterator<Sequence<VocabWord>> iter = transformer.iterator();

    // The single sequence's tokens must come back in their original order.
    Sequence<VocabWord> sequence = null;
    int cnt = 0;
    while (iter.hasNext()) {
        sequence = iter.next();
        List<VocabWord> words = sequence.getElements();
        for (VocabWord word : words) {
            assertEquals(stringsArray[cnt], word.getWord());
            ++cnt;
        }
    }

}
 
Example #20
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
@Ignore // no adagrad these days
public void testWord2VecAdaGrad() throws Exception {
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // Despite the test name, AdaGrad is disabled here; hierarchic softmax is used.
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(5).learningRate(0.025).layerSize(100)
                    .seed(42).batchSize(13500).sampling(0).negativeSample(0)
                    //.epochs(10)
                    .windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false)
                    .useHierarchicSoftmax(true).iterate(iter).workers(4).tokenizerFactory(t).build();

    vec.fit();

    Collection<String> lst = vec.wordsNearest("day", 10);
    log.info(Arrays.toString(lst.toArray()));

    //   assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    // "day" should cluster with other time-related words.
    for (String expected : new String[] {"week", "night", "year"}) {
        assertTrue(lst.contains(expected));
    }
}
 
Example #21
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testWord2VecCBOW() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if (!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // CBOW with hierarchic softmax, fixed seed, four workers.
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).learningRate(0.025).layerSize(150)
                    .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(4)
                    .tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    vec.fit();

    Collection<String> lst = vec.wordsNearest("day", 10);
    log.info(Arrays.toString(lst.toArray()));

    //   assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    // Time-related neighbours expected, and day/night should be strongly similar.
    for (String expected : new String[] {"week", "night", "year"}) {
        assertTrue(lst.contains(expected));
    }
    assertTrue(sim > 0.65f);
}
 
Example #22
Source File: VocabConstructorTest.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Test
public void testBuildJointVocabulary1() throws Exception {
    // Sentence stream -> token sequences -> vocabulary.
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile);

    VocabCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();
    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iter).tokenizerFactory(t).build();

    // Pack the transformer into an AbstractSequenceIterator.
    AbstractSequenceIterator<VocabWord> sequenceIterator =
                    new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequenceIterator, 0).useAdaGrad(false).setTargetVocabCache(cache).build();

    constructor.buildJointVocabulary(true, false);

    // 244 distinct tokens expected; with buildJointVocabulary(true, false) the
    // cache reports zero total word occurrences.
    assertEquals(244, cache.numWords());
    assertEquals(0, cache.totalWordOccurrences());
}
 
Example #23
Source File: ClassifyBySimilarity.java    From Java-for-Data-Science with MIT License 4 votes vote down vote up
public static void main(String[] args) throws Exception {
    // Train ParagraphVectors over the raw sentences, labelling each line "LINE_<n>".
    File corpus = new ClassPathResource("/raw_sentences.txt").getFile();
    SentenceIterator sentences = new BasicLineIterator(corpus);

    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource lineLabels = new LabelsSource("LINE_");

    ParagraphVectors vectors = new ParagraphVectors.Builder()
            .minWordFrequency(1)
            .iterations(5)
            .epochs(1)
            .layerSize(100)
            .learningRate(0.025)
            .labelsSource(lineLabels)
            .windowSize(5)
            .iterate(sentences)
            .trainWordVectors(false)
            .tokenizerFactory(tokenizerFactory)
            .sampling(0)
            .build();

    vectors.fit();

    // Pairs of near-identical sentences should score high...
    double similar1 = vectors.similarity("LINE_9835", "LINE_12492");
    out.println("Comparing lines 9836 & 12493 ('This is my house .'/'This is my world .') Similarity = " + similar1);

    double similar2 = vectors.similarity("LINE_3720", "LINE_16392");
    out.println("Comparing lines 3721 & 16393 ('This is my way .'/'This is my work .') Similarity = " + similar2);

    double similar3 = vectors.similarity("LINE_6347", "LINE_3720");
    out.println("Comparing lines 6348 & 3721 ('This is my case .'/'This is my way .') Similarity = " + similar3);

    // ...while unrelated sentence pairs should score noticeably lower.
    double dissimilar1 = vectors.similarity("LINE_3720", "LINE_9852");
    out.println("Comparing lines 3721 & 9853 ('This is my way .'/'We now have one .') Similarity = " + dissimilar1);

    double dissimilar2 = vectors.similarity("LINE_3720", "LINE_3719");
    out.println("Comparing lines 3721 & 3720 ('This is my way .'/'At first he says no .') Similarity = " + dissimilar2);
}
 
Example #24
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Test
public void reproducibleResults_ForMultipleRuns() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    log.info("reproducibleResults_ForMultipleRuns");
    // FIX: removed two unused locals ('shakespear' and 'basic', both ClassPathResource
    // handles to big/rnj.txt) that were declared but never read.
    SentenceIterator iter = new BasicLineIterator(inputFile);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // Two identically-configured, single-worker, fixed-seed models trained on the
    // same data must produce identical results.
    Word2Vec vec1 = new Word2Vec.Builder().minWordFrequency(1).iterations(1).batchSize(8192).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
            .epochs(1).windowSize(5).allowParallelTokenization(true)
            .workers(1)
            .useHierarchicSoftmax(true)
            .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    Word2Vec vec2 = new Word2Vec.Builder().minWordFrequency(1).iterations(1).batchSize(8192).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
            .epochs(1).windowSize(5).allowParallelTokenization(true)
            .workers(1)
            .useHierarchicSoftmax(true)
            .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    vec1.fit();

    // Both models share the iterator; rewind it before the second fit.
    iter.reset();

    vec2.fit();

    // Vocabularies must match element-for-element, including points and codes.
    for (int e = 0; e < vec1.getVocab().numWords(); e++) {
        val w1 = vec1.getVocab().elementAtIndex(e);
        val w2 = vec2.getVocab().elementAtIndex(e);

        assertNotNull(w1);
        assertNotNull(w2);

        assertEquals(w1.getLabel(), w2.getLabel());

        assertArrayEquals("Failed for token [" + w1.getLabel() + "] at index [" + e + "]", Ints.toArray(w1.getPoints()), Ints.toArray(w2.getPoints()));
        assertArrayEquals("Failed for token [" + w1.getLabel() + "] at index [" + e + "]", Ints.toArray(w1.getCodes()), Ints.toArray(w2.getCodes()));
    }

    // The learned weight matrices must be identical across the two runs.
    val syn0_from_vec1 = ((InMemoryLookupTable<VocabWord>) vec1.getLookupTable()).getSyn0();
    val syn0_from_vec2 = ((InMemoryLookupTable<VocabWord>) vec2.getLookupTable()).getSyn0();

    assertEquals(syn0_from_vec1, syn0_from_vec2);

    log.info("Day/night similarity: {}", vec1.similarity("day", "night"));
    val result = vec1.wordsNearest("day", 10);
    printWords("day", result, vec1);
}
 
Example #25
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Test
public void testRunWord2Vec() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    // Strip white space before and after for each line
    /*val shakespear = new ClassPathResource("big/rnj.txt");
    SentenceIterator iter = new BasicLineIterator(shakespear.getFile());*/
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());


    // SkipGram model, one epoch, fixed seed; precise mode enabled via the builder.
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).batchSize(8192).layerSize(100)
                    .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
                    .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
                    //.negativeSample(10)
                    .epochs(1).windowSize(5).allowParallelTokenization(true)
                    .workers(6)
                    .usePreciseMode(true)
                    .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    // An empty stop-word list was configured above; confirm before training.
    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();
    File tempFile = File.createTempFile("temp", "temp");
    tempFile.deleteOnExit();

    // Persist the full model, then sanity-check the 10 nearest neighbours of "day".
    WordVectorSerializer.writeFullModel(vec, tempFile.getAbsolutePath());
    Collection<String> lst = vec.wordsNearest("day", 10);
    //log.info(Arrays.toString(lst.toArray()));
    printWords("day", lst, vec);

    assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    // Similarity should be meaningful but not degenerate (not identical vectors).
    assertTrue(sim < 1.0);
    assertTrue(sim > 0.4);


    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));

    assertFalse(lst.contains(null));


    // Querying a second time must give consistent neighbours.
    lst = vec.wordsNearest("day", 10);
    //log.info(Arrays.toString(lst.toArray()));
    printWords("day", lst, vec);

    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));

    // Remove any stale cache file left over from earlier runs.
    new File("cache.ser").delete();

    ArrayList<String> labels = new ArrayList<>();
    labels.add("day");
    labels.add("night");
    labels.add("week");

    // getWordVectors must return rows in label order, matching per-word lookups.
    INDArray matrix = vec.getWordVectors(labels);
    assertEquals(matrix.getRow(0, true), vec.getWordVectorMatrix("day"));
    assertEquals(matrix.getRow(1, true), vec.getWordVectorMatrix("night"));
    assertEquals(matrix.getRow(2, true), vec.getWordVectorMatrix("week"));

    WordVectorSerializer.writeWordVectors(vec, pathToWriteto);
}
 
Example #26
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testOutputStream() throws Exception {
    // Round-trips a trained Word2Vec model through two serialization formats:
    // the plain-text vector format (via an OutputStream) and the full zip model
    // format, asserting that the "day" vector survives both round trips.
    File file = File.createTempFile("tmp_ser", "ssa");
    file.deleteOnExit();

    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0)
                    .cache(cache).lr(0.025f).build();

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5)
                    .vocabCache(cache).seed(42)
                    //                .workers(6)
                    .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();

    INDArray day1 = vec.getWordVectorMatrix("day");

    // try-with-resources: the original code leaked the FileOutputStream if
    // writeWordVectors threw before the stream was closed internally.
    try (FileOutputStream fos = new FileOutputStream(file)) {
        WordVectorSerializer.writeWordVectors(vec, fos);
    }

    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);

    INDArray day2 = vec2.getWordVectorMatrix("day");

    assertEquals(day1, day2);

    File tempFile = File.createTempFile("tetsts", "Fdfs");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWord2VecModel(vec, tempFile);

    Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile);
    // Originally vec3 was read back but never checked; verify the full-model
    // round trip preserves the vector as well (same pattern as testUnknown1).
    assertEquals(day1, vec3.getWordVectorMatrix("day"));
}
 
Example #27
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testUnknown1() throws Exception {
    // CUDA runs of this test are extremely slow (> 5 min on a Titan X), so outside
    // of integration runs we bail out early on that backend.
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if (!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    // Line-based sentence source over the shared input file.
    SentenceIterator lineIterator = new BasicLineIterator(inputFile.getAbsolutePath());
    // Whitespace tokenization with the common token preprocessor applied.
    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    // CBOW model configured to map every out-of-vocabulary token onto the
    // dedicated unknown element "PEWPEW".
    Word2Vec model = new Word2Vec.Builder().minWordFrequency(10).useUnknown(true)
                    .unknownElement(new VocabWord(1.0, "PEWPEW")).iterations(1).layerSize(100)
                    .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
                    .sampling(0).elementsLearningAlgorithm(new CBOW<VocabWord>()).epochs(1).windowSize(5)
                    .useHierarchicSoftmax(true).allowParallelTokenization(true)
                    .modelUtils(new FlatModelUtils<VocabWord>()).iterate(lineIterator).tokenizerFactory(tokenizer).build();

    model.fit();

    // The unknown element must be present in the trained vocabulary.
    assertTrue(model.hasWord("PEWPEW"));
    assertTrue(model.getVocab().containsWord("PEWPEW"));

    INDArray unknownVector = model.getWordVectorMatrix("PEWPEW");
    assertNotEquals(null, unknownVector);

    // Round-trip the model through the full zip serialization format.
    File serialized = File.createTempFile("temp", "file");
    serialized.deleteOnExit();

    WordVectorSerializer.writeWord2VecModel(model, serialized);

    log.info("Original configuration: {}", model.getConfiguration());

    Word2Vec deserialized = WordVectorSerializer.readWord2VecModel(serialized);

    // The unknown element and its vector must survive serialization.
    assertTrue(deserialized.hasWord("PEWPEW"));
    assertTrue(deserialized.getVocab().containsWord("PEWPEW"));
    INDArray restoredUnknown = deserialized.getWordVectorMatrix("PEWPEW");

    assertEquals(unknownVector, restoredUnknown);



    // A junk token should resolve to the unknown vector, both before and
    // after the round trip.
    INDArray junkVector = model.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
    INDArray junkRestored = deserialized.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");

    log.info("Restored configuration: {}", deserialized.getConfiguration());

    assertEquals(unknownVector, junkVector);
    assertEquals(unknownVector, junkRestored);
}
 
Example #28
Source File: VocabConstructorTest.java    From deeplearning4j with Apache License 2.0
/**
 * Here we test basic vocab transfer, done WITHOUT labels
 * @throws Exception
 */
@Test
public void testMergedVocab1() throws Exception {
    // Two empty caches: one filled directly, one filled by merging from the first.
    AbstractCache<VocabWord> sourceVocab = new AbstractCache.Builder<VocabWord>().build();

    AbstractCache<VocabWord> targetVocab = new AbstractCache.Builder<VocabWord>().build();

    File corpus = Resources.asFile("big/raw_sentences.txt");

    BasicLineIterator lineIterator = new BasicLineIterator(corpus);


    // Turn raw lines into tokenized VocabWord sequences using the shared tokenizer factory `t`.
    SentenceTransformer sentenceTransformer =
                    new SentenceTransformer.Builder().iterator(lineIterator).tokenizerFactory(t).build();

    AbstractSequenceIterator<VocabWord> sequences =
                    new AbstractSequenceIterator.Builder<>(sentenceTransformer).build();

    // Build the source vocabulary from scratch (min element frequency 1).
    VocabConstructor<VocabWord> sourceConstructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequences, 1).setTargetVocabCache(sourceVocab).build();

    sourceConstructor.buildJointVocabulary(false, true);

    int wordsInSource = sourceVocab.numWords();
    log.info("Source Vocab size: " + wordsInSource);


    // Build the target vocabulary by merging in the source cache; since the
    // underlying corpus is identical, the merged vocab must match in size.
    VocabConstructor<VocabWord> transferConstructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequences, 1).setTargetVocabCache(targetVocab).build();

    transferConstructor.buildMergedVocabulary(sourceVocab, false);

    assertEquals(wordsInSource, targetVocab.numWords());
}
 
Example #29
Source File: VocabConstructorTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testBuildJointVocabulary2() throws Exception {
    // Builds a vocabulary over the raw_sentences corpus with a minimum element
    // frequency of 5 and pins the expected vocab size, word ordering and total
    // word-occurrence count for that fixed corpus.
    File corpusFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator lines = new BasicLineIterator(corpusFile);

    VocabCache<VocabWord> vocab = new AbstractCache.Builder<VocabWord>().build();

    // Tokenize each line into a VocabWord sequence via the shared tokenizer factory `t`.
    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder().iterator(lines).tokenizerFactory(t).build();


    AbstractSequenceIterator<VocabWord> sequences =
                    new AbstractSequenceIterator.Builder<>(sentenceTransformer).build();

    VocabConstructor<VocabWord> builder = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequences, 5).useAdaGrad(false).setTargetVocabCache(vocab).build();

    builder.buildJointVocabulary(false, true);

    //        assertFalse(cache.hasToken("including"));

    // Expected counts are fixed properties of big/raw_sentences.txt at minWordFrequency=5.
    assertEquals(242, vocab.numWords());

    // Frequency ordering: "it" is the most frequent word, "i" second.
    assertEquals("i", vocab.wordAtIndex(1));
    assertEquals("it", vocab.wordAtIndex(0));

    assertEquals(634303, vocab.totalWordOccurrences());
}
 
Example #30
Source File: SequenceVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testInternalVocabConstruction() throws Exception {
    // Trains SequenceVectors end-to-end from a raw text corpus, letting the
    // builder construct the vocabulary internally (resetModel(false), no
    // preconstructed cache), then sanity-checks learned similarities.
    ClassPathResource resource = new ClassPathResource("big/raw_sentences.txt");
    File corpusFile = resource.getFile();

    BasicLineIterator lineIterator = new BasicLineIterator(corpusFile);

    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // Lines -> tokenized VocabWord sequences.
    SentenceTransformer sentenceTransformer =
                    new SentenceTransformer.Builder().iterator(lineIterator).tokenizerFactory(tokenizerFactory).build();

    AbstractSequenceIterator<VocabWord> sequences =
                    new AbstractSequenceIterator.Builder<>(sentenceTransformer).build();

    SequenceVectors<VocabWord> vectors = new SequenceVectors.Builder<VocabWord>(new VectorsConfiguration())
                    .minWordFrequency(5).iterate(sequences).batchSize(250).iterations(1).epochs(1)
                    .resetModel(false).trainElementsRepresentation(true).build();


    logger.info("Fitting model...");

    vectors.fit();

    logger.info("Model ready...");

    // "day" and "night" co-occur heavily in this corpus; expect high similarity.
    double sim = vectors.similarity("day", "night");
    logger.info("Day/night similarity: " + sim);
    assertTrue(sim > 0.6d);

    Collection<String> labels = vectors.wordsNearest("day", 10);
    logger.info("Nearest labels to 'day': " + labels);
}