Java Code Examples for org.apache.lucene.analysis.TokenStream

The following examples show how to use org.apache.lucene.analysis.TokenStream. They are extracted from open source projects; where known, the source project, source file, and license are listed above each example.
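All of these snippets follow the same consumer contract that TokenStream documents: reset() before the first incrementToken(), end() after the last token, then close(). A minimal sketch of that loop, where the analyzer, field name, and text are placeholders:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

static void printTokens(Analyzer analyzer, String field, String text) throws IOException {
  // try-with-resources closes the stream even if consumption fails
  try (TokenStream ts = analyzer.tokenStream(field, text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                      // mandatory before the first incrementToken()
    while (ts.incrementToken()) {
      System.out.println(termAtt.toString());
    }
    ts.end();                        // records final offset state
  }
}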
Example 1
@Test
public void testPrevStrSingleSynonym3() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("ba"));
  stream.reset();
  assertTokenStream(stream, "b,0,1,1/a,1,2,1");

  a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("bba"));
  stream.reset();
  assertTokenStream(stream, "bb,0,2,1/a,2,3,1");

  a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("dcba"));
  stream.reset();
  assertTokenStream(stream, "dcb,0,3,1/a,3,4,1");

  a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("edcba"));
  stream.reset();
  assertTokenStream(stream, "edc,0,3,1/dcb,1,4,1/a,4,5,1");
}
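The assertTokenStream helper belongs to the NGramSynonymTokenizer test suite and is not shown on this page. Judging from the expectation strings, each slash-separated entry encodes term, startOffset, endOffset, and positionIncrement. A hypothetical reconstruction under that assumption (JUnit asserts and attribute imports assumed):

// Hypothetical reconstruction of assertTokenStream; the real helper lives in
// the NGramSynonymTokenizer test code. Assumes each "/"-separated entry is
// "term,startOffset,endOffset,positionIncrement".
static void assertTokenStream(TokenStream stream, String expected) throws IOException {
  CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = stream.getAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.getAttribute(PositionIncrementAttribute.class);
  for (String entry : expected.split("/")) {
    String[] parts = entry.split(",");
    assertTrue("expected another token: " + entry, stream.incrementToken());
    assertEquals(parts[0], termAtt.toString());
    assertEquals(Integer.parseInt(parts[1]), offsetAtt.startOffset());
    assertEquals(Integer.parseInt(parts[2]), offsetAtt.endOffset());
    assertEquals(Integer.parseInt(parts[3]), posIncAtt.getPositionIncrement());
  }
  assertFalse(stream.incrementToken());
  stream.close();
}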
 
Example 2
@Test
public void testSandwichStrExpand2() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("aba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/a,2,3,1/aa,2,3,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bb,1,3,0/b,2,3,0/a,3,4,1/aa,3,4,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcda"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/cd,2,4,1/d,3,4,0/a,4,5,1/aa,4,5,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcdea"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/cd,2,4,1/de,3,5,1/e,4,5,0/a,5,6,1/aa,5,6,0");
}
 
Example 3
Source Project: lucene-solr   Source File: HighlighterTest.java    License: Apache License 2.0
public void testHighlightingSynonymQuery() throws Exception {
  searcher = newSearcher(reader);
  Query query = new SynonymQuery.Builder(FIELD_NAME)
      .addTerm(new Term(FIELD_NAME, "jfk"))
      .addTerm(new Term(FIELD_NAME, "kennedy"))
      .build();
  QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
  Highlighter highlighter = new Highlighter(scorer);
  TokenStream stream = getAnyTokenStream(FIELD_NAME, 2);
  Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
  highlighter.setTextFragmenter(fragmenter);
  String storedField = searcher.doc(2).get(FIELD_NAME);
  String fragment = highlighter.getBestFragment(stream, storedField);
  assertEquals("<B>JFK</B> has been shot", fragment);

  stream = getAnyTokenStream(FIELD_NAME, 3);
  storedField = searcher.doc(3).get(FIELD_NAME);
  fragment = highlighter.getBestFragment(stream, storedField);
  assertEquals("John <B>Kennedy</B> has been shot", fragment);
}
 
Example 4
Source Project: lucene-solr   Source File: TestSuggestStopFilter.java    License: Apache License 2.0
public void testEndNotStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "to"},
                            new int[] {0, 3},
                            new int[] {2, 5},
                            null,
                            new int[] {1, 1},
                            null,
                            5,
                            new boolean[] {false, true},
                            true);
}
 
Example 5
Source Project: lucene-solr   Source File: TestPackedTokenAttributeImpl.java    License: Apache License 2.0
public void testPackedTokenAttributeFactory() throws Exception {
  TokenStream ts = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
  ((Tokenizer)ts).setReader(new StringReader("foo bar"));
  
  assertTrue("CharTermAttribute is not implemented by Token",
    ts.addAttribute(CharTermAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("OffsetAttribute is not implemented by Token",
    ts.addAttribute(OffsetAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("PositionIncrementAttribute is not implemented by Token", 
    ts.addAttribute(PositionIncrementAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("TypeAttribute is not implemented by Token",
    ts.addAttribute(TypeAttribute.class) instanceof PackedTokenAttributeImpl);

  assertTrue("FlagsAttribute is not implemented by FlagsAttributeImpl",
      ts.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl);  
}
 
Example 6
Source Project: russianmorphology   Source File: AnalyzersTest.java    License: Apache License 2.0
@Test
public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException {
    Analyzer morphologyAnalyzer = new RussianAnalyzer();
    InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream("принеси мне вина на новый год".getBytes("UTF-8")), "UTF-8");

    TokenStream tokenStream = morphologyAnalyzer.tokenStream(null, reader);
    tokenStream.reset();
    Set<String> formsOfWine = new HashSet<String>();
    formsOfWine.add("вина");
    formsOfWine.add("вино");
    boolean wordSeen = false;
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
        if (formsOfWine.contains(charTerm.toString()) && wordSeen) {
            assertThat(position.getPositionIncrement(), equalTo(0));
        }
        if (formsOfWine.contains(charTerm.toString())) {
            wordSeen = true;
        }
    }
    tokenStream.close();
}
 
Example 7
Source Project: lucene-solr   Source File: TestNGramFilters.java    License: Apache License 2.0
/**
 * Test NGramFilterFactory on tokens with payloads
 */
public void testNGramFilterPayload() throws Exception {
  Reader reader = new StringReader("test|0.1");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
  stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);

  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    BytesRef payData = payAttr.getPayload();
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData.bytes);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 8
Source Project: lucene-solr   Source File: TestKeepWordFilter.java    License: Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  final Set<String> words = new HashSet<>();
  words.add( "a" );
  words.add( "b" );
  
  Analyzer a = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new KeepWordFilter(tokenizer, new CharArraySet( words, true));
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  
  checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
  a.close();
}
 
Example 9
Source Project: modernmt   Source File: DefaultQueryBuilder.java    License: Apache License 2.0
private static void loadTerms(String fieldName, Sentence sentence, Analyzer analyzer, BooleanQuery output) {
    final int maxClauseCount = BooleanQuery.getMaxClauseCount();
    String text = TokensOutputStream.serialize(sentence, false, true);

    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(fieldName, text);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        int count = 0;

        stream.reset();
        while (stream.incrementToken() && (count + 1) < maxClauseCount) {
            Term term = new Term(fieldName, charTermAttribute.toString());
            output.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            count++;
        }
    } catch (IOException e) {
        throw new Error("This should never happen", e);
    } finally {
        closeQuietly(stream);
    }
}
 
Example 10
Source Project: lucene-solr   Source File: TestFlattenGraphFilter.java    License: Apache License 2.0
public void testHoleUnderSyn() throws Exception {
  // Tests a StopFilter after SynFilter where a stopword in a syn is removed
  //
  //   wizard of oz -> woz syn, but then "of" becomes a hole

  TokenStream in = new CannedTokenStream(0, 12, new Token[] {
      token("wizard", 1, 1, 0, 6),
      token("woz", 0, 3, 0, 12),
      token("oz", 2, 1, 10, 12),
    });


  TokenStream out = new FlattenGraphFilter(in);

  assertTokenStreamContents(out,
                            new String[] {"wizard", "woz", "oz"},
                            new int[] {0, 0, 10},
                            new int[] {6, 12, 12},
                            new int[] {1, 0, 2},
                            new int[] {1, 3, 1},
                            12);
}
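The token(...) helper used above (and again in Example 16) is defined elsewhere in TestFlattenGraphFilter. A sketch of what it plausibly does, assuming the argument order (term, positionIncrement, positionLength, startOffset, endOffset):

// Plausible shape of the token(...) helper from TestFlattenGraphFilter,
// assuming arguments (term, posInc, posLength, startOffset, endOffset).
private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
  final Token t = new Token(term, startOffset, endOffset);
  t.setPositionIncrement(posInc);
  t.setPositionLength(posLength);
  return t;
}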
 
Example 11
Source Project: word   Source File: ChineseWordAnalyzerTest.java    License: Apache License 2.0
@Test
public void test1() {
    try{
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "杨尚川是APDPlat应用级产品开发平台的作者");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while(tokenStream.incrementToken()){
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        String expResult = "[杨尚川, 是, apdplat, 应用级, 产品, 开发, 平台, 的, 作者]";
        if("bigram".equals(WordConfTools.get("ngram", "bigram"))){
            expResult = "[杨尚川, 是, apdplat, 应用, 级, 产品, 开发, 平台, 的, 作者]";
        }
        assertEquals(expResult, words.toString());
    }catch(IOException e){
        fail("分词出错"+e.getMessage());
    }
}
 
Example 12
Source Project: lucene-solr   Source File: TestProtectedTermFilter.java    License: Apache License 2.0
public void testBasic() throws IOException {

    CannedTokenStream cts = new CannedTokenStream(
        new Token("Alice", 1, 0, 5),
        new Token("Bob", 1, 6, 9),
        new Token("Clara", 1, 10, 15),
        new Token("David", 1, 16, 21)
    );

    CharArraySet protectedTerms = new CharArraySet(5, true);
    protectedTerms.add("bob");

    TokenStream ts = new ProtectedTermFilter(protectedTerms, cts, LowerCaseFilter::new);
    assertTokenStreamContents(ts, new String[]{ "alice", "Bob", "clara", "david" });

  }
 
Example 13
Source Project: lucene-solr   Source File: AnalysisImpl.java    License: Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 * @param result list that receives a converted Token for each emitted token
 *
 * @return list of attribute snapshots, one per token produced by the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream, List<Token> result) {
  final List<AttributeSource> tokens = new ArrayList<>();
  try {
    tokenStream.reset();
    CharTermAttribute charAtt = tokenStream.getAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
      tokens.add(tokenStream.cloneAttributes());
      List<TokenAttribute> attributes = copyAttributes(tokenStream, charAtt);
      result.add(new Token(charAtt.toString(), attributes));
    }
    tokenStream.end();
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over TokenStream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
 
Example 14
Source Project: lucene-solr   Source File: ShingleAnalyzerWrapperTest.java    License: Apache License 2.0
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  try (TokenStream ts = analyzer.tokenStream("content", "this sentence")) {
    int j = -1;
  
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  
    ts.reset();
    while (ts.incrementToken()) {
      j += posIncrAtt.getPositionIncrement();
      String termText = termAtt.toString();
      builder.add(new Term("content", termText), j);
    }
    ts.end();
  }

  PhraseQuery q = builder.build();
  ScoreDoc[] hits = searcher.search(q, 1000).scoreDocs;
  int[] ranks = new int[] { 0 };
  compareRanks(hits, ranks);
}
 
Example 15
public void testDelim() throws Exception {
  Reader reader = new StringReader("the*0.1 quick*0.1 red*0.1");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("DelimitedPayload",
      "encoder", "float",
      "delimiter", "*").create(stream);
  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    byte[] payData = payAttr.getPayload().bytes;
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 16
Source Project: lucene-solr   Source File: TestFlattenGraphFilter.java    License: Apache License 2.0
public void testStrangelyNumberedNodes() throws Exception {

    // Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!)
    TokenStream in = new CannedTokenStream(0, 27, new Token[] {
        token("dog", 1, 3, 0, 5),
        token("puppy", 0, 3, 0, 5),
        token("flies", 3, 1, 6, 11),
      });

    TokenStream out = new FlattenGraphFilter(in);

    assertTokenStreamContents(out,
                              new String[] {"dog", "puppy", "flies"},
                              new int[] {0, 0, 6},
                              new int[] {5, 5, 11},
                              new int[] {1, 0, 1},
                              new int[] {1, 1, 1},
                              27);
  }
 
Example 17
Source Project: SolrTextTagger   Source File: Tagger.java    License: Apache License 2.0
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
              TagClusterReducer tagClusterReducer, boolean skipAltTokens,
              boolean ignoreStopWords) throws IOException {
  this.terms = terms;
  this.liveDocs = liveDocs;
  this.tokenStream = tokenStream;
  this.skipAltTokens = skipAltTokens;
  this.ignoreStopWords = ignoreStopWords;
  byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
  posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
  taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
  tokenStream.reset();

  this.tagClusterReducer = tagClusterReducer;
}
 
Example 18
/**
 * Query generation expression extracted from
 * {@link org.airsonic.player.service.SearchService#getRandomSongs(RandomSearchCriteria)}.
 */
public Query getRandomSongs(RandomSearchCriteria criteria) throws IOException {

    BooleanQuery.Builder query = new BooleanQuery.Builder();

    Analyzer analyzer = analyzerFactory.getQueryAnalyzer();

    // Unanalyzed field
    query.add(new TermQuery(new Term(FieldNames.MEDIA_TYPE, MediaType.MUSIC.name())), Occur.MUST);

    if (!isEmpty(criteria.getGenre())) {

        // Unanalyzed field, but filtered so the value matches the ID3 tag parser's normalization.
        try (TokenStream stream = analyzer.tokenStream(FieldNames.GENRE, criteria.getGenre())) {
            stream.reset();
            if (stream.incrementToken()) {
                String token = stream.getAttribute(CharTermAttribute.class).toString();
                query.add(new TermQuery(new Term(FieldNames.GENRE, token)), Occur.MUST);
            }
        }
    }

    if (!(isEmpty(criteria.getFromYear()) && isEmpty(criteria.getToYear()))) {
        query.add(toYearRangeQuery.apply(criteria.getFromYear(), criteria.getToYear()), Occur.MUST);
    }

    query.add(toFolderQuery.apply(false, criteria.getMusicFolders()), Occur.MUST);

    return query.build();
}
 
Example 19
Source Project: lucene-solr   Source File: MinHashFilterTest.java    License: Apache License 2.0
@Test
public void testTokenStream1() throws IOException {
  String[] hashes = new String[]{"℁팽徭聙↝ꇁ홱杯",
      // the second hash is degenerate as a string literal, so it is spelled out as raw char values
      new String(new char[]{36347, 63457, 43013, 56843, 52284, 34231, 57934, 42302})};

  TokenStream ts = createTokenStream(5, "woof woof woof woof woof" + " " + "woof woof woof woof puff", 1, 1, 100,
      false);
  assertTokenStreamContents(ts, hashes, new int[]{0, 0},
      new int[]{49, 49}, new String[]{MinHashFilter.MIN_HASH_TYPE, MinHashFilter.MIN_HASH_TYPE}, new int[]{1, 0},
      new int[]{1, 1}, 49, 0, null, true, null);
}
 
Example 20
Source Project: Elasticsearch   Source File: XMoreLikeThis.java    License: Apache License 2.0
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
        throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException("To use MoreLikeThis without " +
                "term vectors, you must provide an Analyzer");
    }
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            if (isSkipTerm(fieldName, word)) {
                continue;
            }

            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    }
}
 
Example 21
private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        assertTrue(i < expected.length);
        assertEquals(expected[i++], termAttr.toString());
    }
    assertEquals(expected.length, i);
    stream.end();
    stream.close();
}
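A hypothetical call site for this helper; the analyzer and inputs below are illustrative, not taken from the original test class:

// Hypothetical usage of assertSimpleTSOutput with a WhitespaceAnalyzer.
Analyzer analyzer = new WhitespaceAnalyzer();
TokenStream stream = analyzer.tokenStream("body", "quick brown fox");
assertSimpleTSOutput(stream, new String[] { "quick", "brown", "fox" });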
 
Example 22
Source Project: lucene-solr   Source File: TestKoreanTokenizer.java    License: Apache License 2.0
private void assertReadings(Analyzer analyzer, String input, String... readings) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
    ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
    ts.reset();
    for(String reading : readings) {
      assertTrue(ts.incrementToken());
      assertEquals(reading, readingAtt.getReading());
    }
    assertFalse(ts.incrementToken());
    ts.end();
  }
}
 
Example 23
Source Project: database   Source File: PatternAnalyzerImpl.java    License: GNU General Public License v2.0
@Override
protected TokenStreamComponents createComponents(final String field) {
	//Use default grouping
	final Tokenizer tokenizer = new PatternTokenizer(pattern,-1);
	final TokenStream filter = new LowerCaseFilter(tokenizer);
	return new TokenStreamComponents(tokenizer, filter);
}
 
Example 24
@Override
public TokenStream create(TokenStream tokenStream) {
    if (version.onOrAfter(Version.LUCENE_4_4_0)) {
        return new HyphenationCompoundWordTokenFilter(tokenStream, hyphenationTree, wordList, minWordSize, 
                                                      minSubwordSize, maxSubwordSize, onlyLongestMatch);
    } else {
        return new Lucene43HyphenationCompoundWordTokenFilter(tokenStream, hyphenationTree, wordList, minWordSize, 
                minSubwordSize, maxSubwordSize, onlyLongestMatch);
    }
}
 
Example 25
Source Project: lucene-solr   Source File: Field.java    License: Apache License 2.0
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
  if (fieldType().indexOptions() == IndexOptions.NONE) {
    // Not indexed
    return null;
  }

  if (!fieldType().tokenized()) {
    if (stringValue() != null) {
      if (!(reuse instanceof StringTokenStream)) {
        // lazy init the TokenStream as it is heavy to instantiate
        // (attributes,...) if not needed
        reuse = new StringTokenStream();
      }
      ((StringTokenStream) reuse).setValue(stringValue());
      return reuse;
    } else if (binaryValue() != null) {
      if (!(reuse instanceof BinaryTokenStream)) {
        // lazy init the TokenStream as it is heavy to instantiate
        // (attributes,...) if not needed
        reuse = new BinaryTokenStream();
      }
      ((BinaryTokenStream) reuse).setValue(binaryValue());
      return reuse;
    } else {
      throw new IllegalArgumentException("Non-Tokenized Fields must have a String value");
    }
  }

  if (tokenStream != null) {
    return tokenStream;
  } else if (readerValue() != null) {
    return analyzer.tokenStream(name(), readerValue());
  } else if (stringValue() != null) {
    return analyzer.tokenStream(name(), stringValue());
  }

  throw new IllegalArgumentException("Field must have either TokenStream, String, Reader or Number value; got " + this);
}
 
Example 26
Source Project: lucene-solr   Source File: HighlighterPhraseTest.java    License: Apache License 2.0
public void testSparsePhrase() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  try {
    final Document document = new Document();

    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectorOffsets(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectors(true);
    document.add(new Field(FIELD, new TokenStreamSparse(), customType));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = DirectoryReader.open(directory);
  try {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    final PhraseQuery phraseQuery = new PhraseQuery(FIELD, "did", "jump");
    TopDocs hits = indexSearcher.search(phraseQuery, 1);
    assertEquals(0, hits.totalHits.value);
    final Highlighter highlighter = new Highlighter(
        new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
        new QueryScorer(phraseQuery));
    final TokenStream tokenStream =
        TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
    assertEquals(
        highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
        highlighter.getBestFragment(tokenStream, TEXT));
  } finally {
    indexReader.close();
    directory.close();
  }
}
 
Example 27
public void testNothingChange() throws Exception {
  Reader reader = new StringReader("this is test.");
  reader = charFilterFactory("PatternReplace",
      "pattern", "(aa)\\s+(bb)\\s+(cc)",
      "replacement", "$1$2$3").create(reader);
  TokenStream ts = whitespaceMockTokenizer(reader);
  assertTokenStreamContents(ts,
      new String[] { "this", "is", "test." },
      new int[] { 0, 5, 8 },
      new int[] { 4, 7, 13 });
}
 
Example 28
public void testKeepIgnoreCase() throws Exception {
  Reader reader = new StringReader("kiTTEN");
  TokenStream stream = keywordMockTokenizer(reader);
  stream = tokenFilterFactory("Capitalization",
      "keep", "kitten",
      "keepIgnoreCase", "true",
      "onlyFirstWord", "true",
      "forceFirstLetter", "true").create(stream);

  assertTokenStreamContents(stream, new String[] { "KiTTEN" });
}
 
Example 29
Source Project: lucene-solr   Source File: TestConcatenatingTokenStream.java    License: Apache License 2.0
public void testBasic() throws IOException {

    AttributeFactory factory = newAttributeFactory();

    final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    first.setReader(new StringReader("first words "));
    final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    second.setReader(new StringReader("second words"));
    final MockTokenizer third = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
    third.setReader(new StringReader(" third words"));

    TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third);
    assertTokenStreamContents(ts,
        new String[] { "first", "words", "second", "words", "third", "words" },
        new int[]{ 0, 6, 12, 19, 25, 31 },
        new int[]{ 5, 11, 18, 24, 30, 36 });

    // test re-use
    first.setReader(new StringReader("first words "));
    second.setReader(new StringReader("second words"));
    third.setReader(new StringReader(" third words"));
    assertTokenStreamContents(ts,
        new String[] { "first", "words", "second", "words", "third", "words" },
        new int[]{ 0, 6, 12, 19, 25, 31 },
        new int[]{ 5, 11, 18, 24, 30, 36 },
        new int[]{ 1, 1, 1, 1, 1, 1 });

  }
 
Example 30
Source Project: lucene-solr   Source File: TestHyphenatedWordsFilter.java    License: Apache License 2.0
public void testOffsets() throws Exception {
  String input = "abc- def geh 1234- 5678-";
  TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)ts).setReader(new StringReader(input));
  ts = new HyphenatedWordsFilter(ts);
  assertTokenStreamContents(ts, 
      new String[] { "abcdef", "geh", "12345678-" },
      new int[] { 0, 9, 13 },
      new int[] { 8, 12, 24 });
}