org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory Java Examples

The following examples show how to use org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory. The examples are drawn from open source projects; the source file, originating project, and license are noted above each example.
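Most of the examples below follow the same basic pattern: instantiate a concrete factory, optionally attach a token pre-processor, and create a Tokenizer from either a String or an InputStream. A minimal, self-contained sketch of that pattern (the sample sentence and the class name TokenizerFactoryBasics are ours for illustration; the DL4J classes and methods all appear in the examples below):

import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

import java.io.ByteArrayInputStream;

public class TokenizerFactoryBasics {
    public static void main(String[] args) {
        TokenizerFactory factory = new DefaultTokenizerFactory();
        factory.setTokenPreProcessor(new CommonPreprocessor()); // normalizes each token (e.g. lowercasing)

        // A factory can build tokenizers from a String or from an InputStream
        Tokenizer fromString = factory.create("Mary had a little lamb.");
        Tokenizer fromStream = factory.create(new ByteArrayInputStream("Mary had a little lamb.".getBytes()));

        while (fromString.hasMoreTokens()) {
            System.out.println(fromString.nextToken());
        }
        System.out.println(fromStream.getTokens()); // all tokens at once, as a List<String>
    }
}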
Example #1
Source File: BertWordPieceTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testBertWordPieceTokenizer1() throws Exception {
    String toTokenize = "I saw a girl with a telescope.";
    TokenizerFactory t = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);
    }

    // Reconstruction uses the full token list, so it only needs to run once,
    // after both tokenizers have been consumed.
    String s2 = BertWordPiecePreProcessor.reconstructFromTokens(tokenizer.getTokens());
    assertEquals(toTokenize, s2);
}
 
Example #2
Source File: PerformanceTests.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testWord2VecCBOWBig() throws Exception {
    SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt");
    //iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt");
    //SentenceIterator iter = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt");

    TokenizerFactory t = new KoreanTokenizerFactory();
    //t = new DefaultTokenizerFactory();
    //t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
                    .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
                    .allowParallelTokenization(true).tokenizerFactory(t)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    long time1 = System.currentTimeMillis();

    vec.fit();

    long time2 = System.currentTimeMillis();

    log.info("Total execution time: {}", (time2 - time1));
}
 
Example #3
Source File: JapaneseTokenizerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testGetTokens() throws Exception {
    TokenizerFactory tf = new JapaneseTokenizerFactory();

    Tokenizer tokenizer = tf.create(toTokenize);

    // Exhaust iterator.
    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        assertEquals(tokenizer.nextToken(), expect[i]);
    }

    // Ensure exhausted.
    assertFalse(tokenizer.hasMoreTokens());

    // Count doesn't change.
    assertEquals(expect.length, tokenizer.countTokens());

    // getTokens still returns everything.
    List<String> tokens = tokenizer.getTokens();
    assertEquals(expect.length, tokens.size());
}
 
Example #4
Source File: ManualTests.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testWord2VecPlot() throws Exception {
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
                    .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
                    .tokenizerFactory(t).build();

    vec.fit();

    //        UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();

    //        vec.getLookupTable().plotVocab(100, connectionInfo);

    // Manual visualization hook: the UI plotting above is commented out, so this
    // just blocks until the 300s test timeout fires.
    Thread.sleep(10000000000L);
    fail("Not implemented");
}
 
Example #5
Source File: BertWordPieceTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore("AB 2019/05/24 - Disabled until dev branch merged - see issue #7657")
public void testBertWordPieceTokenizer5() throws Exception {
    // Longest Token in Vocab is 22 chars long, so make sure splits on the edge are properly handled
    String toTokenize = "Donaudampfschifffahrts Kapitänsmützeninnenfuttersaum";
    TokenizerFactory t = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    final List<String> expected = Arrays.asList("Donau", "##dam", "##pf", "##schiff", "##fahrt", "##s", "Kapitän", "##sm", "##ützen", "##innen", "##fu", "##tter", "##sa", "##um");
    assertEquals(expected, tokenizer.getTokens());
    assertEquals(expected, tokenizer2.getTokens());

    String s2 = BertWordPiecePreProcessor.reconstructFromTokens(tokenizer.getTokens());
    assertEquals(toTokenize, s2);
}
 
Example #6
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    long time1 = System.currentTimeMillis();
    Word2Vec vec = WordVectorSerializer.readWord2VecModel(
                    new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
    long time2 = System.currentTimeMillis();
    log.info("Model loaded in {} msec", time2 - time1);
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    vec.setTokenizerFactory(t);
    vec.setSentenceIterator(iter);
    vec.getConfiguration().setUseHierarchicSoftmax(false);
    vec.getConfiguration().setNegative(5.0);
    vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());

    vec.fit();
}
 
Example #7
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultStreamTokenizer() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    assertEquals(5, tokenizer2.countTokens());

    int cnt = 0;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer2.nextToken();
        log.info(tok1);
        cnt++;
    }

    assertEquals(5, cnt);
}
 
Example #8
Source File: Word2VecIteratorTest.java    From deeplearning4j with Apache License 2.0
@Before
public void before() throws Exception {
    if (vec == null) {
        ClassPathResource resource = new ClassPathResource("/labeled/");
        File dir = testDir.newFolder();
        resource.copyDirectory(dir);
        SentenceIterator iter = UimaSentenceIterator.createWithPath(dir.getAbsolutePath());
        new File("cache.ser").delete();

        TokenizerFactory t = new UimaTokenizerFactory();

        vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).layerSize(100)
                        .stopWords(new ArrayList<String>()).useUnknown(true).windowSize(5).iterate(iter)
                        .tokenizerFactory(t).build();
        vec.fit();

    }
}
 
Example #9
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultTokenizer2() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    tokenizer2.countTokens();
    while (tokenizer.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        assertEquals(tok1, tok2);
    }


    System.out.println("-----------------------------------------------");

    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();
    int stringCount2 = t.create(resource.getInputStream()).countTokens();

    log.info("String tok: [" + stringCount + "], Stream tok: [" + stringCount2 + "], Difference: "
                    + Math.abs(stringCount - stringCount2));

    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
 
Example #10
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultTokenizer1() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);
    }


    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();
    int stringCount2 = t.create(resource.getInputStream()).countTokens();
    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
 
Example #11
Source File: ChineseTokenizerTest.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
    SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");

    log.info("load is right!");
    TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
    //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());

    //Generates a word-vector from the dataset stored in resources folder
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
                    .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build();
    vec.fit();
    WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));

    // Train a model that can find names in news articles (*.txt), using the generated word vectors
    // WordVectors wordVectors;

    // Test whether the model can find names in previously unseen text

}
 
Example #12
Source File: DefaultDocumentIteratorTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testDocumentIterator() throws Exception {
    ClassPathResource reuters5250 = new ClassPathResource("/reuters/5250");
    File f = reuters5250.getFile();

    DocumentIterator iter = new FileDocumentIterator(f.getAbsolutePath());

    InputStream doc = iter.nextDocument();

    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer next = t.create(doc);
    String[] list = "PEARSON CONCENTRATES ON FOUR SECTORS".split(" ");
    int count = 0;
    while (next.hasMoreTokens() && count < list.length) {
        String token = next.nextToken();
        assertEquals(list[count++], token);
    }


    doc.close();
}
 
Example #13
Source File: WordVectorSerializer.java    From deeplearning4j with Apache License 2.0
protected static TokenizerFactory getTokenizerFactory(VectorsConfiguration configuration) {
    if (configuration == null)
        return null;

    if (configuration.getTokenizerFactory() != null && !configuration.getTokenizerFactory().isEmpty()) {
        try {
            TokenizerFactory factory =
                            (TokenizerFactory) Class.forName(configuration.getTokenizerFactory()).newInstance();

            if (configuration.getTokenPreProcessor() != null && !configuration.getTokenPreProcessor().isEmpty()) {
                TokenPreProcess preProcessor =
                                (TokenPreProcess) Class.forName(configuration.getTokenPreProcessor()).newInstance();
                factory.setTokenPreProcessor(preProcessor);
            }

            return factory;

        } catch (Exception e) {
            log.error("Can't instantiate saved TokenizerFactory: {}", configuration.getTokenizerFactory());
        }
    }
    return null;
}
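A hedged usage sketch of the method above: assuming VectorsConfiguration exposes String setters matching the getters used here (it stores only class names), and that we are calling from within WordVectorSerializer or a subclass, a factory can be restored like this:

// Sketch only: setTokenizerFactory/setTokenPreProcessor(String) are assumed to be
// the setters matching the getters used in getTokenizerFactory(...) above.
VectorsConfiguration configuration = new VectorsConfiguration();
configuration.setTokenizerFactory(DefaultTokenizerFactory.class.getCanonicalName());
configuration.setTokenPreProcessor(CommonPreprocessor.class.getCanonicalName());

// Both classes are re-instantiated reflectively; on failure the method logs and returns null
TokenizerFactory restored = getTokenizerFactory(configuration);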
 
Example #14
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
    WordVectors googleVectors = WordVectorSerializer.readWord2VecModel(new File("/ext/GoogleNews-vectors-negative300.bin.gz"));

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    ParagraphVectors pv =
                    new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10).useHierarchicSoftmax(false)
                                    .trainWordVectors(false).useExistingWordVectors(googleVectors)
                                    .negativeSample(10).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
 
Example #15
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testParallelIterator() throws IOException {
    TokenizerFactory factory = new DefaultTokenizerFactory();
    SentenceIterator iterator = new BasicLineIterator(Resources.asFile("big/raw_sentences.txt"));

    SentenceTransformer transformer = new SentenceTransformer.Builder().iterator(iterator).allowMultithreading(true)
            .tokenizerFactory(factory).build();

    BasicTransformerIterator iter = (BasicTransformerIterator)transformer.iterator();
    for (int i = 0; i < 100; ++i) {
        int cnt = 0;
        long counter = 0;
        Sequence<VocabWord> sequence = null;
        while (iter.hasNext()) {
            sequence = iter.next();
            counter += sequence.size();
            cnt++;
        }
        iter.reset();
        assertEquals(757172, counter);
    }
}
 
Example #16
Source File: Windows.java    From deeplearning4j with Apache License 2.0
/**
 * Constructs a list of windows of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @param vectors word vectors used to filter tokens; tokens with no vector (and no UNK defined) are skipped
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, @NonNull TokenizerFactory tokenizerFactory, int windowSize,
                WordVectors vectors) {
    Tokenizer tokenizer = tokenizerFactory.create(words);
    List<String> list = new ArrayList<>();
    while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();

        // if we don't have UNK word defined - we have to skip this word
        if (vectors.getWordVectorMatrix(token) != null)
            list.add(token);
    }

    if (list.isEmpty())
        throw new IllegalStateException("No tokens found for windows");

    return windows(list, windowSize);
}
 
Example #17
Source File: JapaneseTokenizerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testBaseForm() throws Exception {
    TokenizerFactory tf = new JapaneseTokenizerFactory(true);

    Tokenizer tokenizer1 = tf.create(toTokenize);
    Tokenizer tokenizer2 = tf.create(baseString);

    assertEquals("黒い", tokenizer1.nextToken());
    assertEquals("驚く", tokenizer2.nextToken());
}
 
Example #18
Source File: JapaneseTokenizerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testJapaneseTokenizer() throws Exception {
    TokenizerFactory t = new JapaneseTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);

    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        assertEquals(tokenizer.nextToken(), expect[i]);
    }
}
 
Example #19
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultTokenizer3() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);
    }
}
 
Example #20
Source File: BertWordPieceTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testBertWordPieceTokenizer2() throws Exception {
    TokenizerFactory t = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);

    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();
    int stringCount2 = t.create(resource.getInputStream()).countTokens();
    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
 
Example #21
Source File: BertWordPieceTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testBertWordPieceTokenizer3() throws Exception {
    String toTokenize = "Donaudampfschifffahrtskapitänsmützeninnenfuttersaum";
    TokenizerFactory t = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    final List<String> expected = Arrays.asList("Donau", "##dam", "##pf", "##schiff", "##fahrt", "##skap", "##itä", "##ns", "##m", "##ützen", "##innen", "##fu", "##tter", "##sa", "##um");
    assertEquals(expected, tokenizer.getTokens());
    assertEquals(expected, tokenizer2.getTokens());

    String s2 = BertWordPiecePreProcessor.reconstructFromTokens(tokenizer.getTokens());
    assertEquals(toTokenize, s2);
}
 
Example #22
Source File: BertWordPieceTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testBertWordPieceTokenizer4() throws Exception {
    String toTokenize = "I saw a girl with a telescope.";
    TokenizerFactory t = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    final List<String> expected = Arrays.asList("I", "saw", "a", "girl", "with", "a", "tele", "##scope", ".");
    assertEquals(expected, tokenizer.getTokens());
    assertEquals(expected, tokenizer2.getTokens());

    String s2 = BertWordPiecePreProcessor.reconstructFromTokens(tokenizer.getTokens());
    assertEquals(toTokenize, s2);
}
 
Example #23
Source File: Word2VecModelExample.java    From Java-Deep-Learning-Cookbook with MIT License
public static void main(String[] args) throws Exception {
    final SentenceIterator iterator = new LineSentenceIterator(new ClassPathResource("raw_sentences_large.txt").getFile());
    SentenceDataPreProcessor.setPreprocessor(iterator);
    final TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new EndingPreProcessor());

    final Word2Vec model = new Word2Vec.Builder()
                                    .iterate(iterator)
                                    .tokenizerFactory(tokenizerFactory)
                                    .minWordFrequency(5)
                                    .layerSize(100)
                                    .seed(42)
                                    .epochs(50)
                                    .windowSize(5)
                                    .build();
    log.info("Fitting Word2Vec model....");
    model.fit();

    final Collection<String> words = model.wordsNearest("season", 10);
    for (final String word : words) {
        System.out.println(word + " ");
    }
    final double cosSimilarity = model.similarity("season","program");
    System.out.println(cosSimilarity);

    // Configure Barnes-Hut t-SNE for later visualization of the saved vectors
    // (the t-SNE model is only configured here, not fit).
    BarnesHutTsne tsne = new BarnesHutTsne.Builder()
            .setMaxIter(100)
            .theta(0.5)
            .normalize(false)
            .learningRate(500)
            .useAdaGrad(false)
            .build();


    //save word vectors for tSNE visualization.
    WordVectorSerializer.writeWordVectors(model.lookupTable(),new File("words.txt"));
    WordVectorSerializer.writeWord2VecModel(model, "model.zip");

}
 
Example #24
Source File: Word2VecDataSetIteratorTest.java    From deeplearning4j with Apache License 2.0
/**
 * Basically all we want from this test is being able to finish without exceptions.
 */
@Test
public void testIterator1() throws Exception {

    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = ParagraphVectorsTest.getIterator(isIntegrationTests(), inputFile);
    //        SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10) // we make sure we'll have some missing words
                    .iterations(1).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0)
                    .useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>())
                    .useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    vec.fit();

    List<String> labels = new ArrayList<>();
    labels.add("positive");
    labels.add("negative");

    Word2VecDataSetIterator iterator = new Word2VecDataSetIterator(vec, getLASI(iter, labels), labels, 1);
    INDArray array = iterator.next().getFeatures();
    int count = 0;
    while (iterator.hasNext()) {
        DataSet ds = iterator.next();

        assertArrayEquals(array.shape(), ds.getFeatures().shape());

        if (!isIntegrationTests() && count++ > 20)
            break; // raw_sentences.txt is 2.81 MB and takes a while to process; only run the first 20 minibatches in unit tests
    }
}
 
Example #25
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testDoubleFit() throws Exception {
    boolean isIntegration = isIntegrationTests();
    File resource = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator iter = getIterator(isIntegration, resource);


    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    val builder = new ParagraphVectors.Builder();
    ParagraphVectors vec = builder.minWordFrequency(1).iterations(5).seed(119).epochs(1)
            .layerSize(150).learningRate(0.025).labelsSource(source).windowSize(5)
            .sequenceLearningAlgorithm(new DM<VocabWord>()).iterate(iter).trainWordVectors(true)
            .usePreciseWeightInit(true)
            .batchSize(8192)
            .allowParallelTokenization(false)
            .tokenizerFactory(t).workers(1).sampling(0).build();

    vec.fit();
    long num1 = vec.vocab().totalNumberOfDocs();

    vec.fit();
    System.out.println(vec.vocab().totalNumberOfDocs());
    long num2 = vec.vocab().totalNumberOfDocs();

    assertEquals(num1, num2);
}
 
Example #26
Source File: Word2Vec.java    From deeplearning4j with Apache License 2.0
/**
 * Specifies the TokenizerFactory to be used for tokenization.
 *
 * PLEASE NOTE: you can't use an anonymous implementation here; only the
 * factory's canonical class name is stored, and the factory is re-instantiated
 * by reflection when the model is restored.
 *
 * @param factory tokenizer factory to use
 * @return this builder
 */
public Builder tokenizerFactory(@NonNull TokenizerFactory factory) {
    this.tokenizer = factory.getClass().getCanonicalName();

    if (factory.getTokenPreProcessor() != null) {
        this.tokenPreprocessor = factory.getTokenPreProcessor().getClass().getCanonicalName();
    } else {
        this.tokenPreprocessor = "";
    }

    return this;
}
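The warning about anonymous implementations follows from the body above: only the canonical class name is stored, and the factory is later rebuilt via Class.forName(...).newInstance(), as shown in Example #13. A short sketch of what does and does not survive that round trip (builder here is a hypothetical Word2Vec.Builder instance):

// OK: a named class with a public no-arg constructor can be re-created by name
builder.tokenizerFactory(new DefaultTokenizerFactory());

// Not OK: an anonymous class has no canonical name (getCanonicalName() returns null)
// and no accessible no-arg constructor, so reflective re-instantiation would fail:
// builder.tokenizerFactory(new TokenizerFactory() { /* ... */ });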
 
Example #27
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore //AB 2020/02/06 - https://github.com/eclipse/deeplearning4j/issues/8677
public void testDirectInference() throws Exception {
    boolean isIntegration = isIntegrationTests();
    File resource = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator sentencesIter = getIterator(isIntegration, resource);

    ClassPathResource resource_mixed = new ClassPathResource("paravec/");
    File local_resource_mixed = testDir.newFolder();
    resource_mixed.copyDirectory(local_resource_mixed);
    SentenceIterator iter = new AggregatingSentenceIterator.Builder()
                    .addSentenceIterator(sentencesIter)
                    .addSentenceIterator(new FileSentenceIterator(local_resource_mixed)).build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec wordVectors = new Word2Vec.Builder().minWordFrequency(1).batchSize(250).iterations(1).epochs(1)
                    .learningRate(0.025).layerSize(150).minLearningRate(0.001)
                    .elementsLearningAlgorithm(new SkipGram<VocabWord>()).useHierarchicSoftmax(true).windowSize(5)
                    .iterate(iter).tokenizerFactory(t).build();

    wordVectors.fit();

    ParagraphVectors pv = new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10)
                    .useHierarchicSoftmax(true).trainWordVectors(true).useExistingWordVectors(wordVectors)
                    .negativeSample(0).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
 
Example #28
Source File: Word2VecTest.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testSparkW2VonBiggerCorpus() throws Exception {
    SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest")
                    .set("spark.driver.host", "localhost")
                    .set("spark.driver.maxResultSize", "4g").set("spark.driver.memory", "8g")
                    .set("spark.executor.memory", "8g");

    // Set SparkContext
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Path of data part-00000
    //String dataPath = Resources.asFile("big/raw_sentences.txt").getAbsolutePath();
    //        String dataPath = "/ext/Temp/SampleRussianCorpus.txt";
    String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();

    // Read in data
    JavaRDD<String> corpus = sc.textFile(dataPath);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new LowCasePreProcessor());

    Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1)
                    //     .setTokenizer("org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory")
                    //     .setTokenPreprocessor("org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor")
                    //     .setRemoveStop(false)
                    .tokenizerFactory(t).seed(42L).negative(3).useAdaGrad(false).layerSize(100).windowSize(5)
                    .learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5)
                    .useUnknown(true).build();

    word2Vec.train(corpus);


    sc.stop();

    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), "/ext/Temp/sparkRuModel.txt");
}
 
Example #29
Source File: Windows.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Constructs a list of windows using the default window size of 5.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, TokenizerFactory tokenizerFactory) {
    Tokenizer tokenizer = tokenizerFactory.create(words);
    List<String> list = new ArrayList<>();
    while (tokenizer.hasMoreTokens())
        list.add(tokenizer.nextToken());
    return windows(list, 5);
}
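A hedged usage sketch of this helper: with any of the factories shown above, it tokenizes the text and groups tokens into padded windows. The sample sentence is arbitrary, and we assume Window exposes a getWords() accessor for its token list:

TokenizerFactory factory = new DefaultTokenizerFactory();
List<Window> windows = Windows.windows("Mary had a little lamb.", factory);
for (Window w : windows) {
    System.out.println(w.getWords()); // each window holds 5 tokens, padded at the edges
}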
 
Example #30
Source File: SparkWord2Vec.java    From deeplearning4j with Apache License 2.0
/**
 * This method defines the tokenizer that will be used for corpus tokenization.
 *
 * @param tokenizerFactory tokenizer factory to use
 * @return this builder
 */
public Builder setTokenizerFactory(@NonNull TokenizerFactory tokenizerFactory) {
    configuration.setTokenizerFactory(tokenizerFactory.getClass().getCanonicalName());
    if (tokenizerFactory.getTokenPreProcessor() != null)
        configuration.setTokenPreProcessor(
                        tokenizerFactory.getTokenPreProcessor().getClass().getCanonicalName());

    return this;
}