Java Code Examples for org.apache.lucene.analysis.core.WhitespaceTokenizer#setReader()

The following examples show how to use org.apache.lucene.analysis.core.WhitespaceTokenizer#setReader() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ConcatenateFilterTest.java    From SolrTextTagger with Apache License 2.0 6 votes vote down vote up
/** Concatenating "new york city" must yield a single "shingle" token spanning the whole input. */
public void testTypical() throws IOException {
  final String input = "new york city";
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(input));
  ConcatenateFilter concat = new ConcatenateFilter(tokenizer);
  try {
    assertTokenStreamContents(concat, new String[]{input},
        new int[]{0}, new int[]{input.length()}, new String[]{"shingle"},
        new int[]{1}, null, input.length(), true);
  } catch (AssertionError e) {
    // assertTokenStreamContents also probes whether end() was implemented correctly, but its
    // probe mutates a special attribute *after* incrementToken(), which is incompatible with
    // ConcatenateFilter. end() appears correct here, so only that one failure is tolerated.
    if (!e.getMessage().equals("super.end()/clearAttributes() was not called correctly in end()")) {
      throw e;
    }
  }
}
 
Example 2
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** CommonGramsFilter must replay the same leading tokens after its tokenizer is re-fed. */
public void testReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(input));
  CommonGramsFilter filter = new CommonGramsFilter(tokenizer, commonWords);

  CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
  filter.reset();
  for (String expected : new String[]{"How", "How_the", "the", "the_s"}) {
    assertTrue(filter.incrementToken());
    assertEquals(expected, termAtt.toString());
  }
  filter.close();

  // Feed the identical text again: the stream must restart from the first token.
  tokenizer.setReader(new StringReader(input));
  filter.reset();
  assertTrue(filter.incrementToken());
  assertEquals("How", termAtt.toString());
}
 
Example 3
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** CommonGramsQueryFilter must emit only the common bigrams, and survive a reader reset. */
public void testQueryReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(input));
  CommonGramsFilter grams = new CommonGramsFilter(tokenizer, commonWords);
  CommonGramsQueryFilter queryFilter = new CommonGramsQueryFilter(grams);

  // Attribute instances are shared along the filter chain, so reading from the tokenizer works.
  CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
  queryFilter.reset();
  for (String expected : new String[]{"How_the", "the_s"}) {
    assertTrue(queryFilter.incrementToken());
    assertEquals(expected, termAtt.toString());
  }
  queryFilter.close();

  tokenizer.setReader(new StringReader(input));
  queryFilter.reset();
  assertTrue(queryFilter.incrementToken());
  assertEquals("How_the", termAtt.toString());
}
 
Example 4
Source File: XmlInterpolationTest.java    From SolrTextTagger with Apache License 2.0 6 votes vote down vote up
/**
 * Strips HTML from {@code docText} (leaving "unescaped" elements alone), whitespace-tokenizes
 * the remainder, and returns the token texts in order.
 */
private String[] analyzeReturnTokens(String docText) {
  List<String> tokens = new ArrayList<>();

  Reader stripped = new HTMLStripCharFilter(new StringReader(docText),
          Collections.singleton("unescaped"));
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  final CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
  try {
    tokenizer.setReader(stripped);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      tokens.add(termAtt.toString());
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(tokenizer);
  }
  return tokens.toArray(new String[tokens.size()]);
}
 
Example 5
Source File: XmlInterpolationTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * HTML-strips {@code docText} (elements named "unescaped" are left intact), then splits the
 * result on whitespace and collects every token text.
 */
private String[] analyzeReturnTokens(String docText) {
  List<String> out = new ArrayList<>();

  Reader htmlStripped = new HTMLStripCharFilter(new StringReader(docText),
          Collections.singleton("unescaped"));
  WhitespaceTokenizer ws = new WhitespaceTokenizer();
  final CharTermAttribute term = ws.addAttribute(CharTermAttribute.class);
  try {
    ws.setReader(htmlStripped);
    ws.reset();
    for (; ws.incrementToken(); ) {
      out.add(term.toString());
    }
    ws.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ws);
  }
  return out.toArray(new String[out.size()]);
}
 
Example 6
Source File: DatasetAnalyzer.java    From gerbil with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Counts whitespace-separated tokens in {@code text} using a Lucene WhitespaceTokenizer.
 *
 * @param text the text to tokenize; must not be null
 * @return the number of tokens found, or the count accumulated so far if tokenization fails
 */
private int countTokensInText(String text) {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    int tokens = 0;
    try {
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            ++tokens;
        }
        // Complete the TokenStream contract (reset -> incrementToken* -> end -> close);
        // the original omitted end(), which the Lucene workflow requires before close().
        tokenizer.end();
    } catch (Exception e) {
        // Best-effort: log and return the partial count rather than propagating.
        LOGGER.error("Error while tokenizing text. Returning.", e);
    } finally {
        IOUtils.closeQuietly(tokenizer);
    }
    return tokens;
}
 
Example 7
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/** A partial phrase prefix ("new york" without "city") must fall back to the single words. */
public void testIncompletePhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "big apple", "new york city", "property tax", "three word phrase"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("some new york"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, false);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    for (String expected : new String[]{"some", "new", "york"}) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 8
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/** When overlapping phrases end the input, the longest match ("city of new york") wins. */
public void testOverlappingAtEnd() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("the great city of new york"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, false);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    for (String expected : new String[]{"the", "great", "city_of_new_york"}) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 9
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/** When overlapping phrases start the input, the longest match ("new york city") wins. */
public void testOverlappingAtBeginning() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("new york city is great"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, false);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    for (String expected : new String[]{"new_york_city", "is", "great"}) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 10
Source File: NGramTokenFilterTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** The same NGramTokenFilter instance must replay identical 1-grams after its tokenizer is re-fed. */
public void testReset() throws Exception {
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("abcde"));
  NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false);
  String[] grams = {"a", "b", "c", "d", "e"};
  int[] starts = {0, 0, 0, 0, 0};
  int[] ends = {5, 5, 5, 5, 5};
  int[] increments = {1, 0, 0, 0, 0};
  assertTokenStreamContents(filter, grams, starts, ends, increments);
  // Re-feed the same input and verify the stream replays identically.
  tokenizer.setReader(new StringReader("abcde"));
  assertTokenStreamContents(filter, grams, starts, ends, increments);
}
 
Example 11
Source File: XmlInterpolationTest.java    From SolrTextTagger with Apache License 2.0 5 votes vote down vote up
/**
 * HTML-strips {@code docText}, whitespace-tokenizes it, and returns the start offset of the
 * first token equal to {@code start} and the end offset of the first token equal to
 * {@code end} (each -1 when not found). Returns early once {@code end} is seen.
 */
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] offsets = {-1, -1};

  Reader stripped = new HTMLStripCharFilter(new StringReader(docText));

  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  final CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
  try {
    tokenizer.setReader(stripped);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      String term = termAtt.toString();
      if (term.equals(start)) {
        offsets[0] = offsetAtt.startOffset();
      }
      if (term.equals(end)) {
        offsets[1] = offsetAtt.endOffset();
        return offsets;  // finally still closes the tokenizer
      }
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(tokenizer);
  }
  return offsets;
}
 
Example 12
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/** With emitSingleTokens=true, singles stream out first and overlapping phrases follow. */
public void testOverlappingAtEndEmitSingle() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("the great city of new york"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, true);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    String[] expectedTokens = {
            "the", "great", "city", "of", "new", "york", "city_of_new_york", "new_york"
    };
    for (String expected : expectedTokens) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 13
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/** With emitSingleTokens=true, leading singles are emitted alongside every phrase match. */
public void testOverlappingAtBeginningEmitSingle() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("new york city is great"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, true);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    String[] expectedTokens = {
            "new", "york", "new_york", "new_york_city", "city", "is", "great"
    };
    for (String expected : expectedTokens) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 14
Source File: XmlInterpolationTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Locates a tag pair in HTML-stripped text: element 0 is the start offset of the first token
 * matching {@code start}, element 1 the end offset of the first token matching {@code end};
 * -1 marks "not found". Stops scanning as soon as {@code end} is matched.
 */
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] found = {-1, -1};

  Reader htmlStripped = new HTMLStripCharFilter(new StringReader(docText));

  WhitespaceTokenizer ws = new WhitespaceTokenizer();
  final CharTermAttribute term = ws.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsets = ws.addAttribute(OffsetAttribute.class);
  try {
    ws.setReader(htmlStripped);
    ws.reset();
    for (; ws.incrementToken(); ) {
      String text = term.toString();
      if (text.equals(start)) {
        found[0] = offsets.startOffset();
      }
      if (text.equals(end)) {
        found[1] = offsets.endOffset();
        return found;  // the finally block still runs and closes ws
      }
    }
    ws.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ws);
  }
  return found;
}
 
Example 15
Source File: EdgeNGramTokenFilterTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** The same EdgeNGramTokenFilter instance must replay identical grams after its tokenizer is re-fed. */
public void testReset() throws Exception {
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("abcde"));
  EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false);
  String[] grams = {"a", "ab", "abc"};
  int[] starts = {0, 0, 0};
  int[] ends = {5, 5, 5};
  assertTokenStreamContents(filter, grams, starts, ends);
  // Re-feed the same input and verify the stream replays identically.
  tokenizer.setReader(new StringReader("abcde"));
  assertTokenStreamContents(filter, grams, starts, ends);
}
 
Example 16
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 4 votes vote down vote up
/** With emitSingleTokens=true, every original word appears plus each matched phrase. */
public void testAutoPhraseEmitSingle() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "income tax", "tax refund", "property tax"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(
            "what is my income tax refund this year now that my property tax is so high"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, true);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    String[] expectedTokens = {
            "what", "is", "my", "income", "income_tax", "tax", "tax_refund", "refund",
            "this", "year", "now", "that", "my", "property", "property_tax", "tax",
            "is", "so", "high"
    };
    for (String expected : expectedTokens) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 17
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 4 votes vote down vote up
/** Without emitSingleTokens, matched phrases replace their constituent words entirely. */
public void testAutoPhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(
            Arrays.asList("income tax", "tax refund", "property tax"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(
            "what is my income tax refund this year now that my property tax is so high"));
    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, false);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    String[] expectedTokens = {
            "what", "is", "my", "income_tax", "tax_refund", "this", "year",
            "now", "that", "my", "property_tax", "is", "so", "high"
    };
    for (String expected : expectedTokens) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 18
Source File: Zemberek2DeASCIIfyFilterFactory.java    From lucene-solr-analysis-turkish with Apache License 2.0 4 votes vote down vote up
/**
 * Manual smoke test: de-ASCIIfies a Turkish sample sentence and prints each resulting term.
 *
 * @param args unused
 * @throws IOException if tokenization fails
 */
public static void main(String[] args) throws IOException {

        Map<String, String> map = new HashMap<>();

        Zemberek2DeASCIIfyFilterFactory factory = new Zemberek2DeASCIIfyFilterFactory(map);
        WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
        whitespaceTokenizer.setReader(new StringReader("kus asisi ortaklar çekişme masali"));

        // try-with-resources: the original closed only the StringReader and leaked the
        // TokenStream; closing the stream also releases the tokenizer's underlying reader.
        try (TokenStream stream = factory.create(whitespaceTokenizer)) {
            CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String term = termAttribute.toString();
                System.out.println(term);
            }
            stream.end();
        }
    }
 
Example 19
Source File: Zemberek2StemFilterFactory.java    From lucene-solr-analysis-turkish with Apache License 2.0 4 votes vote down vote up
/**
 * Manual smoke test: stems a Turkish sample sentence with the "frequency" strategy and prints
 * each resulting term.
 *
 * @param args unused
 * @throws IOException if tokenization fails
 */
public static void main(String[] args) throws IOException {

        Map<String, String> map = new HashMap<>();
        map.put("strategy", "frequency");

        Zemberek2StemFilterFactory factory = new Zemberek2StemFilterFactory(map);

        WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
        whitespaceTokenizer.setReader(new StringReader("elması utansın ortaklar çekişme ile"));

        // try-with-resources: the original closed only the StringReader and leaked the
        // TokenStream; closing the stream also releases the tokenizer's underlying reader.
        try (TokenStream stream = factory.create(whitespaceTokenizer)) {
            CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String term = termAttribute.toString();
                System.out.println(term);
            }
            stream.end();
        }
    }
 
Example 20
Source File: ReSearcherUtils.java    From solr-researcher with Apache License 2.0 4 votes vote down vote up
/**
 * Splits a query string into tokens, treating each double-quote character as a separate token
 * (which simplifies later inspection of phrase queries). Blank tokens are dropped.
 *
 * @param queryString the raw query text to tokenize
 * @param tokens receives the extracted tokens, in order
 * @return the number of double-quote tokens found in the query
 */
public static int tokenizeQueryString(String queryString, List<String> tokens) {
  int countOfQuotes = 0;

  try {
    // Whitespace-tokenize first; per-token quote splitting happens in addToken().
    Map<String, String> args = new HashMap<String, String>();
    args.put(WhitespaceTokenizerFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_6_3_0.toString());
    WhitespaceTokenizerFactory f = new WhitespaceTokenizerFactory(args);

    WhitespaceTokenizer s = (WhitespaceTokenizer) f.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    s.setReader(new StringReader(queryString));
    // Fetch the attribute once; the original re-fetched it on every loop iteration.
    CharTermAttribute termAtt = s.getAttribute(CharTermAttribute.class);
    s.reset();
    try {
      if (termAtt != null) {  // defensive, mirrors the original's null check
        // Idiomatic TokenStream loop. The original read the attribute *before* the first
        // incrementToken(), which only worked because blank tokens are skipped.
        while (s.incrementToken()) {
          countOfQuotes += addToken(termAtt.toString(), tokens);
        }
      }
      s.end();
    } finally {
      s.close();  // close even when incrementToken()/end() throws
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return countOfQuotes;
}

/**
 * Appends {@code text} to {@code tokens}, splitting leading/trailing quote characters off as
 * separate "\"" tokens; blank tokens are dropped.
 *
 * @return the number of quote tokens added
 */
private static int addToken(String text, List<String> tokens) {
  int quotes = 0;
  if (text.equals("\"")) {
    tokens.add("\"");
    quotes++;
  } else if (text.startsWith("\"")) {
    tokens.add("\"");
    quotes++;
    if (text.endsWith("\"")) {
      tokens.add(text.substring(1, text.length() - 1));
      tokens.add("\"");
      quotes++;
    } else {
      tokens.add(text.substring(1));
    }
  } else if (text.endsWith("\"")) {
    tokens.add(text.substring(0, text.length() - 1));
    tokens.add("\"");
    quotes++;
  } else if (!text.trim().equals("")) {
    // Keep only non-blank tokens.
    tokens.add(text);
  }
  return quotes;
}