Java Code Examples for org.apache.lucene.analysis.core.WhitespaceTokenizer#setReader()

The following examples show how to use org.apache.lucene.analysis.core.WhitespaceTokenizer#setReader() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ConcatenateFilterTest.java    From SolrTextTagger with Apache License 2.0 6 votes vote down vote up
/** Concatenating "new york city" must yield a single "shingle" token spanning the whole input. */
public void testTypical() throws IOException {
  final String input = "new york city";
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(input));
  ConcatenateFilter concat = new ConcatenateFilter(tokenizer);
  try {
    assertTokenStreamContents(concat, new String[]{input},
        new int[]{0}, new int[]{input.length()}, new String[]{"shingle"},
        new int[]{1}, null, input.length(), true);
  } catch (AssertionError e) {
    // assertTokenStreamContents also probes whether end() was implemented correctly, but its
    // probe mutates a special attribute *after* incrementToken(), which is incompatible with
    // ConcatenateFilter. end() appears correct here, so only that one failure is tolerated.
    if (!e.getMessage().equals("super.end()/clearAttributes() was not called correctly in end()")) {
      throw e;
    }
  }
}
 
Example 2
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** CommonGramsFilter must replay the same leading tokens after its tokenizer is re-fed. */
public void testReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(input));
  CommonGramsFilter filter = new CommonGramsFilter(tokenizer, commonWords);

  CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
  filter.reset();
  for (String expected : new String[]{"How", "How_the", "the", "the_s"}) {
    assertTrue(filter.incrementToken());
    assertEquals(expected, termAtt.toString());
  }
  filter.close();

  // Feed the identical text again: the stream must restart from the first token.
  tokenizer.setReader(new StringReader(input));
  filter.reset();
  assertTrue(filter.incrementToken());
  assertEquals("How", termAtt.toString());
}
 
Example 3
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** CommonGramsQueryFilter must emit only the common bigrams, and survive a reader reset. */
public void testQueryReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(input));
  CommonGramsFilter grams = new CommonGramsFilter(tokenizer, commonWords);
  CommonGramsQueryFilter queryFilter = new CommonGramsQueryFilter(grams);

  // Attribute instances are shared along the filter chain, so reading from the tokenizer works.
  CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
  queryFilter.reset();
  for (String expected : new String[]{"How_the", "the_s"}) {
    assertTrue(queryFilter.incrementToken());
    assertEquals(expected, termAtt.toString());
  }
  queryFilter.close();

  tokenizer.setReader(new StringReader(input));
  queryFilter.reset();
  assertTrue(queryFilter.incrementToken());
  assertEquals("How_the", termAtt.toString());
}
 
Example 4
Source File: XmlInterpolationTest.java    From SolrTextTagger with Apache License 2.0 6 votes vote down vote up
/**
 * Strips HTML from {@code docText} (leaving "unescaped" elements alone), whitespace-tokenizes
 * the remainder, and returns the token texts in order.
 */
private String[] analyzeReturnTokens(String docText) {
  List<String> tokens = new ArrayList<>();

  Reader stripped = new HTMLStripCharFilter(new StringReader(docText),
          Collections.singleton("unescaped"));
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  final CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
  try {
    tokenizer.setReader(stripped);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      tokens.add(termAtt.toString());
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(tokenizer);
  }
  return tokens.toArray(new String[tokens.size()]);
}
 
Example 5
Source File: XmlInterpolationTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * HTML-strips {@code docText} (elements named "unescaped" are left intact), then splits the
 * result on whitespace and collects every token text.
 */
private String[] analyzeReturnTokens(String docText) {
  List<String> out = new ArrayList<>();

  Reader htmlStripped = new HTMLStripCharFilter(new StringReader(docText),
          Collections.singleton("unescaped"));
  WhitespaceTokenizer ws = new WhitespaceTokenizer();
  final CharTermAttribute term = ws.addAttribute(CharTermAttribute.class);
  try {
    ws.setReader(htmlStripped);
    ws.reset();
    for (; ws.incrementToken(); ) {
      out.add(term.toString());
    }
    ws.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ws);
  }
  return out.toArray(new String[out.size()]);
}
 
Example 6
Source File: DatasetAnalyzer.java    From gerbil with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Counts whitespace-separated tokens in {@code text} using a Lucene WhitespaceTokenizer.
 *
 * @param text the text to tokenize; must not be null
 * @return the number of tokens found, or the count accumulated so far if tokenization fails
 */
private int countTokensInText(String text) {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    int tokens = 0;
    try {
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            ++tokens;
        }
        // Complete the TokenStream contract (reset -> incrementToken* -> end -> close);
        // the original omitted end(), which the Lucene workflow requires before close().
        tokenizer.end();
    } catch (Exception e) {
        // Best-effort: log and return the partial count rather than propagating.
        LOGGER.error("Error while tokenizing text. Returning.", e);
    } finally {
        IOUtils.closeQuietly(tokenizer);
    }
    return tokens;
}
 
Example 7
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/** A partial phrase prefix ("new york" without "city") must fall back to the single words. */
public void testIncompletePhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "big apple", "new york city", "property tax", "three word phrase"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("some new york"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, false);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    for (String expected : new String[]{"some", "new", "york"}) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 8
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/** When overlapping phrases end the input, the longest match ("city of new york") wins. */
public void testOverlappingAtEnd() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("the great city of new york"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, false);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    for (String expected : new String[]{"the", "great", "city_of_new_york"}) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 9
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/** When overlapping phrases start the input, the longest match ("new york city") wins. */
public void testOverlappingAtBeginning() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("new york city is great"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, false);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    for (String expected : new String[]{"new_york_city", "is", "great"}) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 10
Source File: NGramTokenFilterTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** The same NGramTokenFilter instance must replay identical 1-grams after its tokenizer is re-fed. */
public void testReset() throws Exception {
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("abcde"));
  NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false);
  String[] grams = {"a", "b", "c", "d", "e"};
  int[] starts = {0, 0, 0, 0, 0};
  int[] ends = {5, 5, 5, 5, 5};
  int[] increments = {1, 0, 0, 0, 0};
  assertTokenStreamContents(filter, grams, starts, ends, increments);
  // Re-feed the same input and verify the stream replays identically.
  tokenizer.setReader(new StringReader("abcde"));
  assertTokenStreamContents(filter, grams, starts, ends, increments);
}
 
Example 11
Source File: XmlInterpolationTest.java    From SolrTextTagger with Apache License 2.0 5 votes vote down vote up
/**
 * HTML-strips {@code docText}, whitespace-tokenizes it, and returns the start offset of the
 * first token equal to {@code start} and the end offset of the first token equal to
 * {@code end} (each -1 when not found). Returns early once {@code end} is seen.
 */
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] offsets = {-1, -1};

  Reader stripped = new HTMLStripCharFilter(new StringReader(docText));

  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  final CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
  try {
    tokenizer.setReader(stripped);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      String term = termAtt.toString();
      if (term.equals(start)) {
        offsets[0] = offsetAtt.startOffset();
      }
      if (term.equals(end)) {
        offsets[1] = offsetAtt.endOffset();
        return offsets;  // finally still closes the tokenizer
      }
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(tokenizer);
  }
  return offsets;
}
 
Example 12
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/** With emitSingleTokens=true, singles stream out first and overlapping phrases follow. */
public void testOverlappingAtEndEmitSingle() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("the great city of new york"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, true);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    String[] expectedTokens = {
            "the", "great", "city", "of", "new", "york", "city_of_new_york", "new_york"
    };
    for (String expected : expectedTokens) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 13
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/** With emitSingleTokens=true, leading singles are emitted alongside every phrase match. */
public void testOverlappingAtBeginningEmitSingle() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("new york city is great"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, true);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    String[] expectedTokens = {
            "new", "york", "new_york", "new_york_city", "city", "is", "great"
    };
    for (String expected : expectedTokens) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 14
Source File: XmlInterpolationTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Locates a tag pair in HTML-stripped text: element 0 is the start offset of the first token
 * matching {@code start}, element 1 the end offset of the first token matching {@code end};
 * -1 marks "not found". Stops scanning as soon as {@code end} is matched.
 */
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] found = {-1, -1};

  Reader htmlStripped = new HTMLStripCharFilter(new StringReader(docText));

  WhitespaceTokenizer ws = new WhitespaceTokenizer();
  final CharTermAttribute term = ws.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsets = ws.addAttribute(OffsetAttribute.class);
  try {
    ws.setReader(htmlStripped);
    ws.reset();
    for (; ws.incrementToken(); ) {
      String text = term.toString();
      if (text.equals(start)) {
        found[0] = offsets.startOffset();
      }
      if (text.equals(end)) {
        found[1] = offsets.endOffset();
        return found;  // the finally block still runs and closes ws
      }
    }
    ws.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ws);
  }
  return found;
}
 
Example 15
Source File: EdgeNGramTokenFilterTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** The same EdgeNGramTokenFilter instance must replay identical grams after its tokenizer is re-fed. */
public void testReset() throws Exception {
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("abcde"));
  EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false);
  String[] grams = {"a", "ab", "abc"};
  int[] starts = {0, 0, 0};
  int[] ends = {5, 5, 5};
  assertTokenStreamContents(filter, grams, starts, ends);
  // Re-feed the same input and verify the stream replays identically.
  tokenizer.setReader(new StringReader("abcde"));
  assertTokenStreamContents(filter, grams, starts, ends);
}
 
Example 16
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 4 votes vote down vote up
/** With emitSingleTokens=true, every original word appears plus each matched phrase. */
public void testAutoPhraseEmitSingle() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "income tax", "tax refund", "property tax"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(
            "what is my income tax refund this year now that my property tax is so high"));

    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, true);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    String[] expectedTokens = {
            "what", "is", "my", "income", "income_tax", "tax", "tax_refund", "refund",
            "this", "year", "now", "that", "my", "property", "property_tax", "tax",
            "is", "so", "high"
    };
    for (String expected : expectedTokens) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 17
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 4 votes vote down vote up
/** Without emitSingleTokens, matched phrases replace their constituent words entirely. */
public void testAutoPhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(
            Arrays.asList("income tax", "tax refund", "property tax"), false);

    final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(
            "what is my income tax refund this year now that my property tax is so high"));
    AutoPhrasingTokenFilter filter = new AutoPhrasingTokenFilter(tokenizer, phraseSets, false);
    filter.setReplaceWhitespaceWith('_');
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();

    String[] expectedTokens = {
            "what", "is", "my", "income_tax", "tax_refund", "this", "year",
            "now", "that", "my", "property_tax", "is", "so", "high"
    };
    for (String expected : expectedTokens) {
        assertTrue(filter.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
}
 
Example 18
Source File: Zemberek2DeASCIIfyFilterFactory.java    From lucene-solr-analysis-turkish with Apache License 2.0 4 votes vote down vote up
/**
 * Manual smoke test: de-ASCIIfies a Turkish sample sentence and prints each resulting term.
 *
 * @param args unused
 * @throws IOException if tokenization fails
 */
public static void main(String[] args) throws IOException {

        Map<String, String> map = new HashMap<>();

        Zemberek2DeASCIIfyFilterFactory factory = new Zemberek2DeASCIIfyFilterFactory(map);
        WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
        whitespaceTokenizer.setReader(new StringReader("kus asisi ortaklar çekişme masali"));

        // try-with-resources: the original closed only the StringReader and leaked the
        // TokenStream; closing the stream also releases the tokenizer's underlying reader.
        try (TokenStream stream = factory.create(whitespaceTokenizer)) {
            CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String term = termAttribute.toString();
                System.out.println(term);
            }
            stream.end();
        }
    }
 
Example 19
Source File: Zemberek2StemFilterFactory.java    From lucene-solr-analysis-turkish with Apache License 2.0 4 votes vote down vote up
/**
 * Manual smoke test: stems a Turkish sample sentence with the "frequency" strategy and prints
 * each resulting term.
 *
 * @param args unused
 * @throws IOException if tokenization fails
 */
public static void main(String[] args) throws IOException {

        Map<String, String> map = new HashMap<>();
        map.put("strategy", "frequency");

        Zemberek2StemFilterFactory factory = new Zemberek2StemFilterFactory(map);

        WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
        whitespaceTokenizer.setReader(new StringReader("elması utansın ortaklar çekişme ile"));

        // try-with-resources: the original closed only the StringReader and leaked the
        // TokenStream; closing the stream also releases the tokenizer's underlying reader.
        try (TokenStream stream = factory.create(whitespaceTokenizer)) {
            CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String term = termAttribute.toString();
                System.out.println(term);
            }
            stream.end();
        }
    }
 
Example 20
Source File: ReSearcherUtils.java    From solr-researcher with Apache License 2.0 4 votes vote down vote up
/**
 * Splits a query string into tokens, treating each double-quote character as a separate token
 * (which simplifies later inspection of phrase queries). Blank tokens are dropped.
 *
 * @param queryString the raw query text to tokenize
 * @param tokens receives the extracted tokens, in order
 * @return the number of double-quote tokens found in the query
 */
public static int tokenizeQueryString(String queryString, List<String> tokens) {
  int countOfQuotes = 0;

  try {
    // Whitespace-tokenize first; per-token quote splitting happens in addToken().
    Map<String, String> args = new HashMap<String, String>();
    args.put(WhitespaceTokenizerFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_6_3_0.toString());
    WhitespaceTokenizerFactory f = new WhitespaceTokenizerFactory(args);

    WhitespaceTokenizer s = (WhitespaceTokenizer) f.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    s.setReader(new StringReader(queryString));
    // Fetch the attribute once; the original re-fetched it on every loop iteration.
    CharTermAttribute termAtt = s.getAttribute(CharTermAttribute.class);
    s.reset();
    try {
      if (termAtt != null) {  // defensive, mirrors the original's null check
        // Idiomatic TokenStream loop. The original read the attribute *before* the first
        // incrementToken(), which only worked because blank tokens are skipped.
        while (s.incrementToken()) {
          countOfQuotes += addToken(termAtt.toString(), tokens);
        }
      }
      s.end();
    } finally {
      s.close();  // close even when incrementToken()/end() throws
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return countOfQuotes;
}

/**
 * Appends {@code text} to {@code tokens}, splitting leading/trailing quote characters off as
 * separate "\"" tokens; blank tokens are dropped.
 *
 * @return the number of quote tokens added
 */
private static int addToken(String text, List<String> tokens) {
  int quotes = 0;
  if (text.equals("\"")) {
    tokens.add("\"");
    quotes++;
  } else if (text.startsWith("\"")) {
    tokens.add("\"");
    quotes++;
    if (text.endsWith("\"")) {
      tokens.add(text.substring(1, text.length() - 1));
      tokens.add("\"");
      quotes++;
    } else {
      tokens.add(text.substring(1));
    }
  } else if (text.endsWith("\"")) {
    tokens.add(text.substring(0, text.length() - 1));
    tokens.add("\"");
    quotes++;
  } else if (!text.trim().equals("")) {
    // Keep only non-blank tokens.
    tokens.add(text);
  }
  return quotes;
}