Java Code Examples for org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory#create()

The following examples show how to use org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory#create(). All of the examples are taken from the deeplearning4j project (Apache License 2.0); the source file for each example is noted in its heading.
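
All of these factories share the same create() contract: it accepts either a String or an InputStream and returns a Tokenizer, whose tokens are consumed via hasMoreTokens()/nextToken() or retrieved at once with getTokens(). Here is a minimal sketch of that shared pattern, using DefaultTokenizerFactory as several of the examples below do (the class name of the sketch itself is illustrative only):

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;

import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class TokenizerFactoryCreateSketch {
    public static void main(String[] args) {
        TokenizerFactory factory = new DefaultTokenizerFactory();

        // Overload 1: tokenize a String.
        Tokenizer fromString = factory.create("Mary had a little lamb.");
        while (fromString.hasMoreTokens()) {
            System.out.println(fromString.nextToken());
        }

        // Overload 2: tokenize an InputStream.
        Tokenizer fromStream = factory.create(
                new ByteArrayInputStream("Mary had a little lamb.".getBytes(StandardCharsets.UTF_8)));
        System.out.println(fromStream.getTokens()); // all tokens as a List<String>
    }
}
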
Example 1
Source File: JapaneseTokenizerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testGetTokens() throws Exception {
    TokenizerFactory tf = new JapaneseTokenizerFactory();

    Tokenizer tokenizer = tf.create(toTokenize);

    // Exhaust iterator.
    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        assertEquals(tokenizer.nextToken(), expect[i]);
    }

    // Ensure exhausted.
    assertEquals(false, tokenizer.hasMoreTokens());

    // Count doesn't change.
    assertEquals(expect.length, tokenizer.countTokens());

    // getTokens still returns everything.
    List<String> tokens = tokenizer.getTokens();
    assertEquals(expect.length, tokens.size());
}
 
Example 2
Source File: Windows.java    From deeplearning4j with Apache License 2.0
/**
 * Constructs a list of windows of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @param vectors the word vectors used to drop tokens that have no vector in the model
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, @NonNull TokenizerFactory tokenizerFactory, int windowSize,
                WordVectors vectors) {
    Tokenizer tokenizer = tokenizerFactory.create(words);
    List<String> list = new ArrayList<>();
    while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();

        // skip tokens that have no vector in the model (no UNK word is defined to fall back on)
        if (vectors.getWordVectorMatrix(token) != null)
            list.add(token);
    }

    if (list.isEmpty())
        throw new IllegalStateException("No tokens found for windows");

    return windows(list, windowSize);
}
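
For context, a hypothetical call site for this overload. The wrapper class and the WordVectors instance (e.g. a Word2Vec model trained or loaded elsewhere) are assumptions, not part of the source above:

import java.util.List;

import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;
import org.deeplearning4j.text.movingwindow.Window;
import org.deeplearning4j.text.movingwindow.Windows;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;

class WindowsUsageSketch {
    // `vectors` is assumed to be a trained or loaded model (hypothetical).
    static List<Window> windowsForSentence(WordVectors vectors) {
        // Tokens absent from the model's vocabulary are dropped before windowing.
        return Windows.windows("the quick brown fox jumps", new DefaultTokenizerFactory(), 5, vectors);
    }
}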
 
Example 3
Source File: BertWordPieceTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore("AB 2019/05/24 - Disabled until dev branch merged - see issue #7657")
public void testBertWordPieceTokenizer5() throws Exception {
    // Longest Token in Vocab is 22 chars long, so make sure splits on the edge are properly handled
    String toTokenize = "Donaudampfschifffahrts Kapitänsmützeninnenfuttersaum";
    TokenizerFactory t = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    final List<String> expected = Arrays.asList("Donau", "##dam", "##pf", "##schiff", "##fahrt", "##s", "Kapitän", "##sm", "##ützen", "##innen", "##fu", "##tter", "##sa", "##um");
    assertEquals(expected, tokenizer.getTokens());
    assertEquals(expected, tokenizer2.getTokens());

    String s2 = BertWordPiecePreProcessor.reconstructFromTokens(tokenizer.getTokens());
    assertEquals(toTokenize, s2);
}
 
Example 4
Source File: DefaultDocumentIteratorTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testDocumentIterator() throws Exception {
    ClassPathResource reuters5250 = new ClassPathResource("/reuters/5250");
    File f = reuters5250.getFile();

    DocumentIterator iter = new FileDocumentIterator(f.getAbsolutePath());

    InputStream doc = iter.nextDocument();

    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer next = t.create(doc);
    String[] list = "PEARSON CONCENTRATES ON FOUR SECTORS".split(" ");
    int count = 0;
    while (next.hasMoreTokens() && count < list.length) {
        String token = next.nextToken();
        assertEquals(list[count++], token);
    }


    doc.close();
}
 
Example 5
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultTokenizer1() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);
    }


    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();
    int stringCount2 = t.create(resource.getInputStream()).countTokens();
    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
 
Example 6
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultTokenizer2() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    tokenizer2.countTokens();
    while (tokenizer.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        assertEquals(tok1, tok2);
    }


    System.out.println("-----------------------------------------------");

    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();
    int stringCount2 = t.create(resource.getInputStream()).countTokens();

    log.info("String tok: [" + stringCount + "], Stream tok: [" + stringCount2 + "], Difference: "
                    + Math.abs(stringCount - stringCount2));

    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
 
Example 7
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultStreamTokenizer() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    assertEquals(5, tokenizer2.countTokens());

    int cnt = 0;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer2.nextToken();
        log.info(tok1);
        cnt++;
    }

    assertEquals(5, cnt);
}
 
Example 8
Source File: BertWordPieceTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testBertWordPieceTokenizer1() throws Exception {
    String toTokenize = "I saw a girl with a telescope.";
    TokenizerFactory t = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);

        String s2 = BertWordPiecePreProcessor.reconstructFromTokens(tokenizer.getTokens());
        assertEquals(toTokenize, s2);
    }
}
 
Example 9
Source File: DefaulTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testDefaultTokenizer3() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);
    }
}
 
Example 10
Source File: JapaneseTokenizerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testBaseForm() throws Exception {
    TokenizerFactory tf = new JapaneseTokenizerFactory(true);

    Tokenizer tokenizer1 = tf.create(toTokenize);
    Tokenizer tokenizer2 = tf.create(baseString);

    assertEquals("黒い", tokenizer1.nextToken());
    assertEquals("驚く", tokenizer2.nextToken());
}
 
Example 11
Source File: JapaneseTokenizerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testJapaneseTokenizer() throws Exception {
    TokenizerFactory t = new JapaneseTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);

    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        assertEquals(tokenizer.nextToken(), expect[i]);
    }
}
 
Example 12
Source File: BertWordPieceTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testBertWordPieceTokenizer4() throws Exception {
    String toTokenize = "I saw a girl with a telescope.";
    TokenizerFactory t = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    final List<String> expected = Arrays.asList("I", "saw", "a", "girl", "with", "a", "tele", "##scope", ".");
    assertEquals(expected, tokenizer.getTokens());
    assertEquals(expected, tokenizer2.getTokens());

    String s2 = BertWordPiecePreProcessor.reconstructFromTokens(tokenizer.getTokens());
    assertEquals(toTokenize, s2);
}
 
Example 13
Source File: BertWordPieceTokenizerTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testBertWordPieceTokenizer3() throws Exception {
    String toTokenize = "Donaudampfschifffahrtskapitänsmützeninnenfuttersaum";
    TokenizerFactory t = new BertWordPieceTokenizerFactory(pathToVocab, false, false, c);
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    final List<String> expected = Arrays.asList("Donau", "##dam", "##pf", "##schiff", "##fahrt", "##skap", "##itä", "##ns", "##m", "##ützen", "##innen", "##fu", "##tter", "##sa", "##um");
    assertEquals(expected, tokenizer.getTokens());
    assertEquals(expected, tokenizer2.getTokens());

    String s2 = BertWordPiecePreProcessor.reconstructFromTokens(tokenizer.getTokens());
    assertEquals(toTokenize, s2);
}
 
Example 14
Source File: KoreanTokenizerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testKoreanTokenizer() throws Exception {
    String toTokenize = "세계 최초의 상용 수준 오픈소스 딥러닝 라이브러리입니다";
    TokenizerFactory t = new KoreanTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    String[] expect = {"세계", "최초", "의", "상용", "수준", "오픈소스", "딥", "러닝", "라이브러리", "입니", "다"};

    assertEquals(expect.length, tokenizer.countTokens());

    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        assertEquals(tokenizer.nextToken(), expect[i]);
    }
}
 
Example 15
Source File: Windows.java    From deeplearning4j with Apache License 2.0
/**
 * Constructs a list of windows with a default window size of 5.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, TokenizerFactory tokenizerFactory) {
    Tokenizer tokenizer = tokenizerFactory.create(words);
    List<String> list = new ArrayList<>();
    while (tokenizer.hasMoreTokens())
        list.add(tokenizer.nextToken());
    return windows(list, 5);
}
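
Because this overload fixes the window size at 5, a call reduces to text plus factory; a one-line sketch (imports as in the earlier Windows sketch):

List<Window> windows = Windows.windows("Mary had a little lamb", new DefaultTokenizerFactory());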
 
Example 16
Source File: Windows.java    From deeplearning4j with Apache License 2.0
/**
 * Constructs a list of windows of size windowSize from an input stream.
 * Note that padding for each window is created as well.
 * @param words the input stream to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @return the list of windows for the tokenized stream
 */
public static List<Window> windows(InputStream words, TokenizerFactory tokenizerFactory, int windowSize) {
    Tokenizer tokenizer = tokenizerFactory.create(words);
    List<String> list = new ArrayList<>();
    while (tokenizer.hasMoreTokens())
        list.add(tokenizer.nextToken());

    if (list.isEmpty())
        throw new IllegalStateException("No tokens found for windows");

    return windows(list, windowSize);
}
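
The stream overload tokenizes directly from an InputStream. A small sketch, using an in-memory stream purely for illustration; the source above does not show whether the method closes the stream, so the caller closes it defensively here (inside a method that declares IOException; imports as before):

try (InputStream in = new ByteArrayInputStream(
        "Mary had a little lamb".getBytes(StandardCharsets.UTF_8))) {
    List<Window> windows = Windows.windows(in, new DefaultTokenizerFactory(), 5);
}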
 
Example 17
Source File: ChineseTokenizerTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testChineseTokenizer() {
    TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
    Tokenizer tokenizer = tokenizerFactory.create(toTokenize);
    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        assertEquals(tokenizer.nextToken(), expect[i]);
    }
}
 
Example 18
Source File: ContextLabelRetriever.java    From deeplearning4j with Apache License 2.0
/**
 * Returns a stripped sentence with the indices of words
 * with certain kinds of labels.
 * @param sentence the sentence to process
 * @param tokenizerFactory the tokenizer factory used to split the sentence
 * @return a pair of the post-processed sentence
 * with labels stripped and the character spans of
 * the labels
 */
public static Pair<String, MultiDimensionalMap<Integer, Integer, String>> stringWithLabels(String sentence,
                                                               TokenizerFactory tokenizerFactory) {
    MultiDimensionalMap<Integer, Integer, String> map = MultiDimensionalMap.newHashBackedMap();
    Tokenizer t = tokenizerFactory.create(sentence);
    List<String> currTokens = new ArrayList<>();
    String currLabel = null;
    String endLabel = null;
    List<Pair<String, List<String>>> tokensWithSameLabel = new ArrayList<>();
    while (t.hasMoreTokens()) {
        String token = t.nextToken();
        if (token.matches(BEGIN_LABEL)) {
            if (endLabel != null)
                throw new IllegalStateException(
                                "Tried parsing sentence; found an end label when the begin label has not been cleared");
            currLabel = token;

            //tokens so far carry no label; add them as NONE before starting the new label
            if (!currTokens.isEmpty()) {
                tokensWithSameLabel.add(new Pair<>("NONE", (List<String>) new ArrayList<>(currTokens)));
                currTokens.clear();

            }

        } else if (token.matches(END_LABEL)) {
            if (currLabel == null)
                throw new IllegalStateException("Found an ending label with no matching begin label");
            endLabel = token;
        } else
            currTokens.add(token);

        if (currLabel != null && endLabel != null) {
            currLabel = currLabel.replaceAll("[<>/]", "");
            endLabel = endLabel.replaceAll("[<>/]", "");
            Preconditions.checkState(!currLabel.isEmpty(), "Current label is empty!");
            Preconditions.checkState(!endLabel.isEmpty(), "End label is empty!");
            Preconditions.checkState(currLabel.equals(endLabel), "Current label begin and end did not match for the parse. Was: %s ending with %s", currLabel, endLabel);

            tokensWithSameLabel.add(new Pair<>(currLabel, (List<String>) new ArrayList<>(currTokens)));
            currTokens.clear();

            //clear out the tokens
            currLabel = null;
            endLabel = null;
        }
    }

    //trailing tokens carry no label; add them as NONE
    if (!currTokens.isEmpty()) {
        tokensWithSameLabel.add(new Pair<>("none", (List<String>) new ArrayList<>(currTokens)));
        currTokens.clear();

    }

    //now join the output
    StringBuilder strippedSentence = new StringBuilder();
    for (Pair<String, List<String>> tokensWithLabel : tokensWithSameLabel) {
        String joinedSentence = StringUtils.join(tokensWithLabel.getSecond(), " ");
        //spaces between separate parts of the sentence
        if (strippedSentence.length() > 0)
            strippedSentence.append(" ");
        strippedSentence.append(joinedSentence);
        int begin = strippedSentence.toString().indexOf(joinedSentence);
        int end = begin + joinedSentence.length();
        map.put(begin, end, tokensWithLabel.getFirst());
    }

    return new Pair<>(strippedSentence.toString(), map);
}
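
A hedged sketch of a call site for stringWithLabels. The accepted markup is governed by the BEGIN_LABEL and END_LABEL patterns, which are not shown above, so the <PERSON> ... </PERSON> tags and the expected output are assumptions for illustration only (imports omitted; the packages of Pair and MultiDimensionalMap vary across deeplearning4j versions):

TokenizerFactory tf = new DefaultTokenizerFactory();
Pair<String, MultiDimensionalMap<Integer, Integer, String>> result =
        ContextLabelRetriever.stringWithLabels("<PERSON> John Smith </PERSON> went to the store", tf);

String stripped = result.getFirst();   // assumed: "John Smith went to the store"
// Keys are (begin, end) character offsets into the stripped sentence; values are labels,
// e.g. (0, 10) -> "PERSON", with unlabeled stretches mapped to "NONE"
// (trailing ones to "none"), per the method above.
MultiDimensionalMap<Integer, Integer, String> spans = result.getSecond();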