org.deeplearning4j.text.sentenceiterator.SentenceIterator Java Examples

The following examples show how to use org.deeplearning4j.text.sentenceiterator.SentenceIterator. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BasicLineIteratorExample.java    From Java-Deep-Learning-Cookbook with MIT License 6 votes vote down vote up
/**
 * Counts the sentences of {@code raw_sentences.txt} with a {@link BasicLineIterator}, then
 * rewinds the iterator, attaches the example pre-processor and prints every sentence.
 *
 * @param args command-line arguments (unused)
 * @throws IOException propagated from resolving or opening the classpath resource
 */
public static void main(String[] args) throws IOException {
    SentenceIterator iterator = new BasicLineIterator(new ClassPathResource("raw_sentences.txt").getFile());
    try {
        int count = 0;
        while (iterator.hasNext()) {
            iterator.nextSentence();
            count++;
        }
        System.out.println("count = " + count);

        // Rewind so the second pass starts from the first sentence again.
        iterator.reset();
        SentenceDataPreProcessor.setPreprocessor(iterator);
        while (iterator.hasNext()) {
            System.out.println(iterator.nextSentence());
        }
    } finally {
        // finish() gives the iterator the chance to close whatever input it holds,
        // even when one of the passes above throws.
        iterator.finish();
    }
}
 
Example #2
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Loads a pre-trained GoogleNews model from disk and continues training it on the test corpus
 * using CBOW with negative sampling instead of hierarchical softmax.
 *
 * NOTE(review): the model path is an absolute path on a developer machine, which is presumably
 * why the test is marked {@code @Ignore}.
 */
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    long loadStart = System.currentTimeMillis();
    File googleModel = new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz");
    Word2Vec model = WordVectorSerializer.readWord2VecModel(googleModel, false);
    long loadEnd = System.currentTimeMillis();
    log.info("Model loaded in {} msec", loadEnd - loadStart);

    SentenceIterator corpus = new BasicLineIterator(inputFile.getAbsolutePath());

    // Whitespace tokenizer with the common token pre-processor applied to every token.
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // The tokenizer factory has to be in place before the sentence iterator is attached.
    model.setTokenizerFactory(tokenizerFactory);
    model.setSentenceIterator(corpus);

    model.getConfiguration().setUseHierarchicSoftmax(false);
    model.getConfiguration().setNegative(5.0);
    model.setElementsLearningAlgorithm(new CBOW<VocabWord>());

    model.fit();
}
 
Example #3
Source File: VectorsConfigurationTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that a {@code Word2Vec} built from an explicit {@code VectorsConfiguration} reports
 * an equal configuration back through {@code getConfiguration()}.
 */
@Test(timeout = 300000)
public void testFromW2V() throws Exception {
    VectorsConfiguration expected = new VectorsConfiguration();
    expected.setHugeModelExpected(true);
    expected.setWindow(5);
    expected.setIterations(3);
    expected.setLayersSize(200);
    expected.setLearningRate(1.4d);
    expected.setSampling(0.0005d);
    expected.setMinLearningRate(0.25d);
    expected.setEpochs(1);

    String corpusPath = Resources.asFile("big/raw_sentences.txt").getAbsolutePath();
    SentenceIterator sentences = UimaSentenceIterator.createWithPath(corpusPath);

    Word2Vec model = new Word2Vec.Builder(expected)
                    .iterate(sentences)
                    .build();

    assertEquals(expected, model.getConfiguration());
}
 
Example #4
Source File: PerformanceTests.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Manual performance run: trains a CBOW model on a large local corpus with a Korean tokenizer
 * and logs the wall-clock time spent in {@code fit()}.
 */
@Ignore
@Test
public void testWord2VecCBOWBig() throws Exception {
    SentenceIterator corpus = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt");
    // Alternative corpora used during manual runs:
    //corpus = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt");
    //SentenceIterator corpus = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt");

    TokenizerFactory tokenizerFactory = new KoreanTokenizerFactory();
    // Alternative tokenizer setup:
    //tokenizerFactory = new DefaultTokenizerFactory();
    //tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec model = new Word2Vec.Builder()
                    .minWordFrequency(1)
                    .iterations(5)
                    .learningRate(0.025)
                    .layerSize(150)
                    .seed(42)
                    .sampling(0)
                    .negativeSample(0)
                    .useHierarchicSoftmax(true)
                    .windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>())
                    .useAdaGrad(false)
                    .iterate(corpus)
                    .workers(8)
                    .allowParallelTokenization(true)
                    .tokenizerFactory(tokenizerFactory)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>())
                    .build();

    long fitStart = System.currentTimeMillis();
    model.fit();
    long fitEnd = System.currentTimeMillis();

    log.info("Total execution time: {}", (fitEnd - fitStart));
}
 
Example #5
Source File: Word2VecIteratorTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Trains the shared {@code vec} model once; later invocations return immediately because the
 * field is already populated.
 */
@Before
public void before() throws Exception {
    if (vec != null) {
        return;
    }

    // Copy the labeled corpus from the classpath into a fresh temporary folder.
    ClassPathResource labeledCorpus = new ClassPathResource("/labeled/");
    File corpusDir = testDir.newFolder();
    labeledCorpus.copyDirectory(corpusDir);

    SentenceIterator sentences = UimaSentenceIterator.createWithPath(corpusDir.getAbsolutePath());
    // Best-effort removal of a cache file in the working directory; the result is ignored.
    new File("cache.ser").delete();

    TokenizerFactory tokenizerFactory = new UimaTokenizerFactory();

    vec = new Word2Vec.Builder()
                    .minWordFrequency(1)
                    .iterations(5)
                    .layerSize(100)
                    .stopWords(new ArrayList<String>())
                    .useUnknown(true)
                    .windowSize(5)
                    .iterate(sentences)
                    .tokenizerFactory(tokenizerFactory)
                    .build();
    vec.fit();
}
 
Example #6
Source File: ChineseTokenizerTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Trains word vectors on the Chinese-names corpus with {@code ChineseTokenizerFactory} and
 * writes them to a text file under {@code src/test/resources}. The remaining steps (training a
 * name-finding model and testing it on unseen text) are only sketched in comments.
 */
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
    SentenceIterator corpus = new BasicLineIterator("src/test/resources/chineseName.txt");
    log.info("load is right!");

    TokenizerFactory chineseTokenizer = new ChineseTokenizerFactory();
    //chineseTokenizer.setTokenPreProcessor(new ChineseTokenizer());

    // Build word vectors from the data set stored in the resources folder.
    Word2Vec model = new Word2Vec.Builder()
                    .minWordFrequency(2)
                    .iterations(5)
                    .layerSize(100)
                    .seed(42)
                    .learningRate(0.1)
                    .windowSize(20)
                    .iterate(corpus)
                    .tokenizerFactory(chineseTokenizer)
                    .build();
    model.fit();

    File vectorsFile = new File("src/test/resources/chineseNameWordVector.txt");
    WordVectorSerializer.writeWordVectors(model, vectorsFile);

    // TODO: train a model that finds all names in news text (suffix txt) using the generated word vectors.
    // WordVectors wordVectors;

    // TODO: test whether that model finds names in unknown text.
}
 
Example #7
Source File: VocabConstructorTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Tokenizes every line of the raw-sentences corpus with the shared tokenizer factory {@code t}
 * and logs the number of non-blank tokens, the number of lines and the distinct tokens seen.
 */
@Test
public void testVocab() throws Exception {
    SentenceIterator sentences = new BasicLineIterator(Resources.asFile("big/raw_sentences.txt"));

    Set<String> distinctTokens = new HashSet<>();
    int lineCount = 0;
    int tokenCount = 0;
    while (sentences.hasNext()) {
        Tokenizer tokenizer = t.create(sentences.nextSentence());
        for (String token : tokenizer.getTokens()) {
            // Null and whitespace-only tokens are neither counted nor stored.
            boolean blank = token == null || token.trim().isEmpty();
            if (!blank) {
                tokenCount++;
                // Set.add is a no-op for elements that are already present.
                distinctTokens.add(token);
            }
        }
        lineCount++;
    }

    log.info("Total number of tokens: [" + tokenCount + "], lines: [" + lineCount + "], set size: [" + distinctTokens.size() + "]");
    log.info("Set:\n" + distinctTokens);
}
 
Example #8
Source File: VocabConstructorTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a joint vocabulary from the raw-sentences corpus with parallel tokenization switched
 * off; the test passes when the build finishes within the 5 second timeout.
 */
@Test(timeout=5000)		// 5s timeout
public void testParallelTokenizationDisabled_Completes() throws Exception {
    SentenceIterator sentences = new BasicLineIterator(Resources.asFile("big/raw_sentences.txt"));

    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder()
            .iterator(sentences)
            .tokenizerFactory(t)
            .build();

    AbstractSequenceIterator<VocabWord> sequences =
            new AbstractSequenceIterator.Builder<>(sentenceTransformer).build();

    VocabConstructor<VocabWord> vocabConstructor = new VocabConstructor.Builder<VocabWord>()
            .addSource(sequences, 5)
            .allowParallelTokenization(false)
            .build();

    vocabConstructor.buildJointVocabulary(false, true);
}
 
Example #9
Source File: ParallelTransformerIteratorTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Iterates the multithreaded {@code SentenceTransformer} over the raw-sentences corpus and
 * checks that every produced sequence is non-null and non-empty, and that exactly 97162
 * sequences are produced.
 */
@Test(timeout = 300000)
public void hasNext() throws Exception {
    SentenceIterator sentences = new BasicLineIterator(Resources.asFile("big/raw_sentences.txt"));

    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder()
                    .iterator(sentences)
                    .allowMultithreading(true)
                    .tokenizerFactory(factory)
                    .build();

    Iterator<Sequence<VocabWord>> sequences = sentenceTransformer.iterator();
    int cnt = 0;
    while (sequences.hasNext()) {
        Sequence<VocabWord> current = sequences.next();
        String failureMessage = "Failed on [" + cnt + "] iteration";
        assertNotEquals(failureMessage, null, current);
        assertNotEquals(failureMessage, 0, current.size());
        cnt++;
    }

    assertEquals(97162, cnt);
}
 
Example #10
Source File: BasicLineIteratorExample.java    From Java-Deep-Learning-Cookbook with MIT License 6 votes vote down vote up
/**
 * Makes two passes over {@code raw_sentences.txt} using a {@link BasicLineIterator}: the first
 * pass only counts sentences, the second prints each sentence after the example pre-processor
 * has been attached.
 *
 * @param args command-line arguments (unused)
 * @throws IOException propagated from resolving or opening the classpath resource
 */
public static void main(String[] args) throws IOException {
    SentenceIterator iterator = new BasicLineIterator(new ClassPathResource("raw_sentences.txt").getFile());
    try {
        // First pass: count sentences without inspecting them.
        int count = 0;
        while (iterator.hasNext()) {
            iterator.nextSentence();
            count++;
        }
        System.out.println("count = " + count);

        // Second pass: rewind, attach the pre-processor and print.
        iterator.reset();
        SentenceDataPreProcessor.setPreprocessor(iterator);
        while (iterator.hasNext()) {
            System.out.println(iterator.nextSentence());
        }
    } finally {
        // Let the iterator release its input whether or not the passes succeeded.
        iterator.finish();
    }
}
 
Example #11
Source File: AsyncLabelAwareIteratorTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Counts the documents of the raw-sentences corpus through the plain label-aware iterator,
 * then again through {@code AsyncLabelAwareIterator} with one {@code reset()} issued after the
 * tenth document. The second count is expected to be ten higher than the first
 * (97172 versus 97162).
 */
@Test(timeout = 300000)
public void nextDocument() throws Exception {
    SentenceIterator sentences = new BasicLineIterator(Resources.asFile("big/raw_sentences.txt"));
    BasicLabelAwareIterator backed = new BasicLabelAwareIterator.Builder(sentences).build();

    int plainCount = 0;
    while (backed.hasNextDocument()) {
        backed.nextDocument();
        plainCount++;
    }
    assertEquals(97162, plainCount);

    backed.reset();

    AsyncLabelAwareIterator asyncIterator = new AsyncLabelAwareIterator(backed, 64);
    int asyncCount = 0;
    while (asyncIterator.hasNext()) {
        asyncIterator.next();
        asyncCount++;

        // Restart once, right after the tenth document has been consumed.
        if (asyncCount == 10) {
            asyncIterator.reset();
        }
    }
    assertEquals(97172, asyncCount);
}
 
Example #12
Source File: BasicLabelAwareIteratorTest.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Walks the whole corpus through {@code BasicLabelAwareIterator} and checks the document
 * count, the number of generated labels and the first label produced from the
 * {@code "DOCZ_"} template.
 */
@Test
public void testHasNextDocument1() throws Exception {
    String corpusPath = Resources.asFile("big/raw_sentences.txt").getAbsolutePath();
    SentenceIterator sentences = new BasicLineIterator(corpusPath);

    BasicLabelAwareIterator documents = new BasicLabelAwareIterator.Builder(sentences)
                    .setLabelTemplate("DOCZ_")
                    .build();

    int documentCount = 0;
    while (documents.hasNextDocument()) {
        documents.nextDocument();
        documentCount++;
    }
    assertEquals(97162, documentCount);

    LabelsSource labelsSource = documents.getLabelsSource();
    assertEquals(97162, labelsSource.getLabels().size());
    assertEquals("DOCZ_0", labelsSource.getLabels().get(0));
}
 
Example #13
Source File: ManualTests.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Manual test: trains a Word2Vec model on the raw-sentences corpus, then sleeps and finally
 * calls {@code fail("Not implemented")}. The plotting calls this test is named after are
 * commented out.
 *
 * NOTE(review): the sleep (10^10 ms) is far longer than the 300000 ms timeout declared on
 * {@code @Test}, so as written this test presumably ends by timeout rather than by reaching
 * {@code fail(...)} -- confirm whether that is intended.
 */
@Test(timeout = 300000)
public void testWord2VecPlot() throws Exception {
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    // Default tokenizer with the common token pre-processor applied to each token.
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
                    .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
                    .tokenizerFactory(t).build();

    vec.fit();

    // Disabled plotting step (kept for reference):
    //        UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();

    //        vec.getLookupTable().plotVocab(100, connectionInfo);

    // Presumably kept the JVM alive so the plot could be inspected manually -- TODO confirm.
    Thread.sleep(10000000000L);
    fail("Not implemented");
}
 
Example #14
Source File: Word2VecDataSetIteratorTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
     * Smoke test: trains a small CBOW Word2Vec model and walks a {@code Word2VecDataSetIterator}
     * built on top of it. The only assertion is that every minibatch has the same feature shape
     * as the first one; beyond that, all we want from this test is to finish without exceptions.
     */
    @Test
    public void testIterator1() throws Exception {

        File inputFile = Resources.asFile("big/raw_sentences.txt");
        // Iterator choice depends on whether integration tests are enabled; see ParagraphVectorsTest.getIterator.
        SentenceIterator iter = ParagraphVectorsTest.getIterator(isIntegrationTests(), inputFile);
//        SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10) // we make sure we'll have some missing words
                        .iterations(1).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0)
                        .useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>())
                        .useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t)
                        .elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

        vec.fit();

        List<String> labels = new ArrayList<>();
        labels.add("positive");
        labels.add("negative");

        // getLASI(...) is defined elsewhere in this test class; presumably it wraps the sentence
        // iterator into a label-aware one -- TODO confirm.
        Word2VecDataSetIterator iterator = new Word2VecDataSetIterator(vec, getLASI(iter, labels), labels, 1);
        // The first minibatch provides the reference feature shape for all later ones.
        INDArray array = iterator.next().getFeatures();
        int count = 0;
        while (iterator.hasNext()) {
            DataSet ds = iterator.next();

            assertArrayEquals(array.shape(), ds.getFeatures().shape());

            // count is only incremented when integration tests are disabled (short-circuit &&).
            if(!isIntegrationTests() && count++ > 20)
                break;  // raw_sentences.txt is 2.81 MB and takes quite some time to process, so unit-test runs stop after roughly the first 20 minibatches
        }
    }
 
Example #15
Source File: ParallelTransformerIteratorTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Feeds a single line containing the numbers 0..99 separated by spaces through the
 * multithreaded {@code SentenceTransformer} and checks that the words come back in the same
 * order they were written.
 */
@Test
public void testCompletes_WhenIteratorHasOneElement() throws Exception {
    final int tokenCount = 100;
    String[] expectedWords = new String[tokenCount];
    StringBuilder singleLine = new StringBuilder();
    for (int i = 0; i < tokenCount; ++i) {
        String token = Integer.toString(i);
        expectedWords[i] = token;
        singleLine.append(token).append(" ");
    }

    InputStream inputStream = IOUtils.toInputStream(singleLine.toString(), "UTF-8");
    SentenceIterator sentences = new BasicLineIterator(inputStream);

    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder()
            .iterator(sentences)
            .allowMultithreading(true)
            .tokenizerFactory(factory)
            .build();

    Iterator<Sequence<VocabWord>> sequences = sentenceTransformer.iterator();

    // Index of the next expected word across all returned sequences.
    int position = 0;
    while (sequences.hasNext()) {
        for (VocabWord word : sequences.next().getElements()) {
            assertEquals(expectedWords[position], word.getWord());
            ++position;
        }
    }
}
 
Example #16
Source File: ParallelTransformerIteratorTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Feeds 1000 one-word lines (the numbers 0..999) through the multithreaded
 * {@code SentenceTransformer} and checks that the words are returned in input order.
 */
@Test
public void orderIsStableForParallelTokenization() throws Exception {
    final int lineCount = 1000;
    String[] expectedWords = new String[lineCount];
    StringBuilder corpusText = new StringBuilder();
    for (int i = 0; i < lineCount; ++i) {
        String token = Integer.toString(i);
        expectedWords[i] = token;
        corpusText.append(token).append("\n");
    }

    InputStream inputStream = IOUtils.toInputStream(corpusText.toString(), "UTF-8");
    SentenceIterator sentences = new BasicLineIterator(inputStream);

    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder()
            .iterator(sentences)
            .allowMultithreading(true)
            .tokenizerFactory(factory)
            .build();

    Iterator<Sequence<VocabWord>> sequences = sentenceTransformer.iterator();

    // Index of the next expected word across all returned sequences.
    int position = 0;
    while (sequences.hasNext()) {
        for (VocabWord word : sequences.next().getElements()) {
            assertEquals(expectedWords[position], word.getWord());
            ++position;
        }
    }
}
 
Example #17
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Requests vectors for four labels and expects a three-row matrix whose rows match the
 * individual vectors of "day", "night" and "week", in that order; "fewfew" contributes no row.
 */
@Test
public void testWordVectorsPartiallyAbsentLabels() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if (!isIntegrationTests()) {
        if ("CUDA".equalsIgnoreCase(backend)) {
            skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
        }
    }

    SentenceIterator sentences = new BasicLineIterator(inputFile.getAbsolutePath());
    // Whitespace tokenization with the common pre-processor applied to every token.
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // NOTE(review): useUnknown is set twice (true, then false); the call order is kept as-is.
    Word2Vec model = new Word2Vec.Builder()
            .minWordFrequency(10)
            .useUnknown(true)
            .iterations(1)
            .layerSize(100)
            .stopWords(new ArrayList<String>())
            .seed(42)
            .learningRate(0.025)
            .minLearningRate(0.001)
            .sampling(0)
            .elementsLearningAlgorithm(new CBOW<VocabWord>())
            .epochs(1)
            .windowSize(5)
            .useHierarchicSoftmax(true)
            .allowParallelTokenization(true)
            .useUnknown(false)
            .modelUtils(new FlatModelUtils<VocabWord>())
            .iterate(sentences)
            .tokenizerFactory(tokenizerFactory)
            .build();

    model.fit();

    ArrayList<String> requested = new ArrayList<>();
    requested.add("fewfew");
    requested.add("day");
    requested.add("night");
    requested.add("week");

    INDArray vectors = model.getWordVectors(requested);
    assertEquals(3, vectors.rows());
    assertEquals(vectors.getRow(0, true), model.getWordVectorMatrix("day"));
    assertEquals(vectors.getRow(1, true), model.getWordVectorMatrix("night"));
    assertEquals(vectors.getRow(2, true), model.getWordVectorMatrix("week"));
}
 
Example #18
Source File: FastTextTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a supervised {@code FastText} instance on top of a line iterator and calls
 * {@code loadIterator()} on it; the test has no assertions.
 */
@Test
public void testLoadIterator() throws FileNotFoundException {
    SentenceIterator sentences = new BasicLineIterator(inputFile.getAbsolutePath());

    FastText fastText = FastText.builder()
            .supervised(true)
            .iterator(sentences)
            .build();
    fastText.loadIterator();
}
 
Example #19
Source File: Word2Vec.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Sets the SentenceIterator that serves as the training corpus source.
 *
 * The iterator is wrapped into a {@code SentenceTransformer} using the current tokenizer
 * factory. When no tokenizer factory has been set yet, an error is logged and the call has no
 * further effect.
 *
 * @param iterator SentenceIterator instance
 */
public void setSentenceIterator(@NonNull SentenceIterator iterator) {
    if (tokenizerFactory == null) {
        // Deliberately logged instead of thrown.
        log.error("Please call setTokenizerFactory() prior to setSentenceIter() call.");
        return;
    }

    // Without a configuration, multithreaded tokenization is allowed.
    boolean multithreaded = configuration == null || configuration.isAllowParallelTokenization();

    SentenceTransformer sentenceTransformer = new SentenceTransformer.Builder()
                    .iterator(iterator)
                    .tokenizerFactory(tokenizerFactory)
                    .allowMultithreading(multithreaded)
                    .build();
    this.iterator = new AbstractSequenceIterator.Builder<>(sentenceTransformer).build();
}
 
Example #20
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Trains a model, writes its word vectors to a temporary text file, reloads them and checks
 * that the document count and every per-word vector match between the original and the
 * reloaded vectors.
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testIndexPersistence() throws Exception {
    File corpusFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator sentences = UimaSentenceIterator.createWithPath(corpusFile.getAbsolutePath());
    // Whitespace tokenization with the common pre-processor applied to every token.
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec trained = new Word2Vec.Builder()
                    .minWordFrequency(5)
                    .iterations(1)
                    .epochs(1)
                    .layerSize(100)
                    .stopWords(new ArrayList<String>())
                    .useAdaGrad(false)
                    .negativeSample(5)
                    .seed(42)
                    .windowSize(5)
                    .iterate(sentences)
                    .tokenizerFactory(tokenizerFactory)
                    .build();

    trained.fit();

    VocabCache originalVocab = trained.getVocab();

    File vectorsFile = File.createTempFile("temp", "w2v");
    vectorsFile.deleteOnExit();

    // Round-trip the vectors through the text format.
    WordVectorSerializer.writeWordVectors(trained, vectorsFile);
    WordVectors restored = WordVectorSerializer.loadTxtVectors(vectorsFile);

    VocabCache restoredVocab = restored.vocab();

    assertEquals(originalVocab.totalNumberOfDocs(), restoredVocab.totalNumberOfDocs());

    for (VocabWord word : trained.getVocab().vocabWords()) {
        INDArray expectedVector = trained.getWordVectorMatrix(word.getLabel());
        INDArray restoredVector = restored.getWordVectorMatrix(word.getLabel());

        assertEquals(expectedVector, restoredVector);
    }
}
 
Example #21
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Trains a model on the test corpus and checks that the ten words returned by
 * {@code wordsNearestSum("day", 10)} include "week", "night", "year", "years" and "time".
 */
@Test
public void testWordsNearestSum() throws IOException {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if (!isIntegrationTests()) {
        if ("CUDA".equalsIgnoreCase(backend)) {
            skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
        }
    }

    log.info("Load & Vectorize Sentences....");
    SentenceIterator sentences = new BasicLineIterator(inputFile);
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec model = new Word2Vec.Builder().minWordFrequency(5).iterations(1).layerSize(100).seed(42)
            .windowSize(5).iterate(sentences).tokenizerFactory(tokenizerFactory).build();

    log.info("Fitting Word2Vec model....");
    model.fit();
    log.info("Writing word vectors to text file....");
    log.info("Closest Words:");
    Collection<String> nearest = model.wordsNearestSum("day", 10);
    log.info("10 Words closest to 'day': {}", nearest);

    // Checked in this order; the first missing word fails the test.
    String[] expectedNeighbours = {"week", "night", "year", "years", "time"};
    for (String expected : expectedNeighbours) {
        assertTrue(nearest.contains(expected));
    }
}
 
Example #22
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Trains a SkipGram model with an explicit unknown element, requests vectors for "bus" and
 * "car", and asserts that every returned row equals the vector looked up under the unknown
 * element's label (presumably neither word occurs in the corpus -- TODO confirm).
 *
 * The unknown element's label and the label used in the assertion now come from one constant.
 * Previously the element was registered as "UNKOWN" while the assertion looked up "UNKNOWN",
 * so the assertion never addressed the element that was actually configured.
 */
@Test
public void testWordVectorsAbsentLabels_WithUnknown() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    // Single source of truth for the unknown element's label.
    final String unknownLabel = "UNKNOWN";

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).batchSize(8192).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
            //.negativeSample(10)
            .epochs(1).windowSize(5).allowParallelTokenization(true)
            .workers(4)
            .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t)
            .useUnknown(true).unknownElement(new VocabWord(1, unknownLabel)).build();

    vec.fit();

    ArrayList<String> labels = new ArrayList<>();
    labels.add("bus");
    labels.add("car");

    INDArray matrix = vec.getWordVectors(labels);
    for (int i = 0; i < labels.size(); ++i) {
        assertEquals(matrix.getRow(i, true), vec.getWordVectorMatrix(unknownLabel));
    }
}
 
Example #23
Source File: BasicLabelAwareIteratorTest.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Iterates the corpus twice with a {@code reset()} in between and checks that both passes see
 * 97162 documents and that the labels source still holds exactly 97162 labels afterwards.
 */
@Test
public void testHasNextDocument2() throws Exception {
    String corpusPath = Resources.asFile("big/raw_sentences.txt").getAbsolutePath();
    SentenceIterator sentences = new BasicLineIterator(corpusPath);

    BasicLabelAwareIterator documents = new BasicLabelAwareIterator.Builder(sentences)
                    .setLabelTemplate("DOCZ_")
                    .build();

    int firstPass = 0;
    while (documents.hasNextDocument()) {
        documents.nextDocument();
        firstPass++;
    }
    assertEquals(97162, firstPass);

    documents.reset();

    int secondPass = 0;
    while (documents.hasNextDocument()) {
        documents.nextDocument();
        secondPass++;
    }
    assertEquals(97162, secondPass);

    LabelsSource labelsSource = documents.getLabelsSource();

    // Key expectation: a reset must not increase the number of labels that were generated.
    assertEquals(97162, labelsSource.getLabels().size());
    assertEquals("DOCZ_0", labelsSource.getLabels().get(0));
}
 
Example #24
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Requests vectors for the single label "fewfew" and expects {@code getWordVectors} to return
 * an empty matrix.
 */
@Test
public void testWordVectorsAbsentLabels() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if (!isIntegrationTests()) {
        if ("CUDA".equalsIgnoreCase(backend)) {
            skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
        }
    }

    SentenceIterator sentences = new BasicLineIterator(inputFile.getAbsolutePath());
    // Whitespace tokenization with the common pre-processor applied to every token.
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // NOTE(review): useUnknown is set twice (true, then false); the call order is kept as-is.
    Word2Vec model = new Word2Vec.Builder()
            .minWordFrequency(10)
            .useUnknown(true)
            .iterations(1)
            .layerSize(100)
            .stopWords(new ArrayList<String>())
            .seed(42)
            .learningRate(0.025)
            .minLearningRate(0.001)
            .sampling(0)
            .elementsLearningAlgorithm(new CBOW<VocabWord>())
            .epochs(1)
            .windowSize(5)
            .useHierarchicSoftmax(true)
            .allowParallelTokenization(true)
            .useUnknown(false)
            .modelUtils(new FlatModelUtils<VocabWord>())
            .iterate(sentences)
            .tokenizerFactory(tokenizerFactory)
            .build();

    model.fit();

    ArrayList<String> requested = new ArrayList<>();
    requested.add("fewfew");

    INDArray vectors = model.getWordVectors(requested);
    assertTrue(vectors.isEmpty());
}
 
Example #25
Source File: Dl4jStringToWord2Vec.java    From wekaDeeplearning4j with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Configures and trains the Word2Vec model on the text attribute of the given instances.
 *
 * @param instances data set whose attribute at {@code textIndex - 1} supplies the sentences
 */
@Override
void initiliazeVectors(Instances instances) {
  // textIndex is converted to the zero-based index expected by the sentence iterator.
  int textColumn = textIndex - 1;
  SentenceIterator sentences = new WekaInstanceSentenceIterator(instances, textColumn);

  // Attach the configured token pre-processor to the tokenizer backend.
  tokenizerFactory.getBackend().setTokenPreProcessor(preprocessor.getBackend());

  // Stop words have to be initialized before the stop list is read below.
  stopWordsHandler.initialize();

  // Assemble the model from the filter's option fields.
  vec = new Word2Vec.Builder()
      .minWordFrequency(minWordFrequency)
      .useAdaGrad(useAdaGrad)
      .allowParallelTokenization(allowParallelTokenization)
      .enableScavenger(enableScavenger)
      .negativeSample(negativeSamplingValue)
      .sampling(subSamplingThres)
      .epochs(epochs)
      .learningRate(learningRate)
      .minLearningRate(minLearningRate)
      .workers(workers)
      .iterations(iterations)
      .layerSize(layerSize)
      .seed(seed)
      .windowSize(windowSize)
      .iterate(sentences)
      .stopWords(stopWordsHandler.getStopList())
      .tokenizerFactory(tokenizerFactory.getBackend())
      .build();

  // Train the model.
  vec.fit();
}
 
Example #26
Source File: UimaSentenceIteratorExample.java    From Java-Deep-Learning-Cookbook with MIT License 5 votes vote down vote up
/**
 * Counts the sentences found under {@code files/} with a {@code UimaSentenceIterator}, then
 * rewinds the iterator, attaches the example pre-processor and prints every sentence.
 *
 * @param args command-line arguments (unused)
 * @throws Exception propagated from creating the UIMA-based iterator
 */
public static void main(String[] args) throws Exception {
    SentenceIterator iterator = UimaSentenceIterator.createWithPath("files/");
    try {
        int count = 0;
        while (iterator.hasNext()) {
            iterator.nextSentence();
            count++;
        }
        System.out.println("count = " + count);

        // Rewind so the second pass starts from the first sentence again.
        iterator.reset();
        SentenceDataPreProcessor.setPreprocessor(iterator);
        while (iterator.hasNext()) {
            System.out.println(iterator.nextSentence());
        }
    } finally {
        // finish() gives the iterator the chance to release whatever input it holds.
        iterator.finish();
    }
}
 
Example #27
Source File: LineSentenceIteratorExample.java    From Java-Deep-Learning-Cookbook with MIT License 5 votes vote down vote up
/**
 * Counts the lines of {@code raw_sentences.txt} with a {@code LineSentenceIterator}, then
 * rewinds the iterator, attaches the example pre-processor and prints every sentence.
 *
 * @param args command-line arguments (unused)
 * @throws IOException propagated from resolving the classpath resource
 */
public static void main(String[] args) throws IOException {
    SentenceIterator iterator = new LineSentenceIterator(new ClassPathResource("raw_sentences.txt").getFile());
    try {
        int count = 0;
        while (iterator.hasNext()) {
            iterator.nextSentence();
            count++;
        }
        System.out.println("count = " + count);

        // Rewind so the second pass starts from the first sentence again.
        iterator.reset();
        SentenceDataPreProcessor.setPreprocessor(iterator);
        while (iterator.hasNext()) {
            System.out.println(iterator.nextSentence());
        }
    } finally {
        // finish() gives the iterator the chance to close the file it reads from.
        iterator.finish();
    }
}
 
Example #28
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Trains a CBOW model with hierarchical softmax on the test corpus and checks that the
 * nearest words to "day" contain "week", "night" and "year", and that the day/night
 * similarity exceeds 0.65.
 */
@Test
public void testWord2VecCBOW() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if (!isIntegrationTests()) {
        if ("CUDA".equalsIgnoreCase(backend)) {
            skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
        }
    }

    SentenceIterator sentences = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec model = new Word2Vec.Builder()
                    .minWordFrequency(1)
                    .iterations(1)
                    .learningRate(0.025)
                    .layerSize(150)
                    .seed(42)
                    .sampling(0)
                    .negativeSample(0)
                    .useHierarchicSoftmax(true)
                    .windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>())
                    .useAdaGrad(false)
                    .iterate(sentences)
                    .workers(4)
                    .tokenizerFactory(tokenizerFactory)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>())
                    .build();

    model.fit();

    Collection<String> nearestToDay = model.wordsNearest("day", 10);
    log.info(Arrays.toString(nearestToDay.toArray()));

    //   assertEquals(10, nearestToDay.size());

    double dayNightSimilarity = model.similarity("day", "night");
    log.info("Day/night similarity: " + dayNightSimilarity);

    assertTrue(nearestToDay.contains("week"));
    assertTrue(nearestToDay.contains("night"));
    assertTrue(nearestToDay.contains("year"));
    assertTrue(dayNightSimilarity > 0.65f);
}
 
Example #29
Source File: Word2VecModelExample.java    From Java-Deep-Learning-Cookbook with MIT License 5 votes vote down vote up
/**
 * Trains a Word2Vec model on {@code raw_sentences_large.txt}, prints the ten words nearest to
 * "season" and the similarity between "season" and "program", then writes the lookup table to
 * {@code words.txt} and the full model to {@code model.zip}.
 *
 * The previously constructed {@code BarnesHutTsne} instance was never used by this method and
 * has been removed; the exported {@code words.txt} remains available for a separate t-SNE step.
 *
 * @param args command-line arguments (unused)
 * @throws Exception propagated from resource loading, training or serialization
 */
public static void main(String[] args) throws Exception {
    final SentenceIterator iterator = new LineSentenceIterator(new ClassPathResource("raw_sentences_large.txt").getFile());
    SentenceDataPreProcessor.setPreprocessor(iterator);
    final TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new EndingPreProcessor());

    final Word2Vec model = new Word2Vec.Builder()
                                    .iterate(iterator)
                                    .tokenizerFactory(tokenizerFactory)
                                    .minWordFrequency(5)
                                    .layerSize(100)
                                    .seed(42)
                                    .epochs(50)
                                    .windowSize(5)
                                    .build();
    log.info("Fitting Word2Vec model....");
    model.fit();

    final Collection<String> words = model.wordsNearest("season",10);
    for(final String word: words){
        System.out.println(word+ " ");
    }
    final double cosSimilarity = model.similarity("season","program");
    System.out.println(cosSimilarity);

    //save word vectors for tSNE visualization.
    WordVectorSerializer.writeWordVectors(model.lookupTable(),new File("words.txt"));
    WordVectorSerializer.writeWord2VecModel(model, "model.zip");

}
 
Example #30
Source File: FileSentenceIteratorExample.java    From Java-Deep-Learning-Cookbook with MIT License 5 votes vote down vote up
/**
 * Counts the sentences found in the {@code files/} classpath directory with a
 * {@code FileSentenceIterator}, then rewinds the iterator, attaches the example pre-processor
 * and prints every sentence.
 *
 * @param args command-line arguments (unused)
 * @throws IOException propagated from resolving the classpath resource
 */
public static void main(String[] args) throws IOException {
    SentenceIterator iterator = new FileSentenceIterator(new ClassPathResource("files/").getFile());
    try {
        int count = 0;
        while (iterator.hasNext()) {
            iterator.nextSentence();
            count++;
        }
        System.out.println("count = " + count);

        // Rewind so the second pass starts from the first sentence again.
        iterator.reset();
        SentenceDataPreProcessor.setPreprocessor(iterator);
        while (iterator.hasNext()) {
            System.out.println(iterator.nextSentence());
        }
    } finally {
        // finish() gives the iterator the chance to close any file it still has open.
        iterator.finish();
    }
}