Java Code Examples for org.apache.lucene.analysis.TokenStream#reset()

The following examples show how to use org.apache.lucene.analysis.TokenStream#reset(). You can go to the original project or source file by following the links above each example, and check out the related API usage in the sidebar.
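All of the examples revolve around the same TokenStream consumption contract: obtain the stream from an Analyzer, call reset() before the first incrementToken(), read attributes inside the consume loop, and call end() and close() once the stream has been fully consumed (a few of the examples below omit that last step). The snippet below is a minimal, self-contained sketch of that pattern; the class name, field name, and sample text are placeholders and do not come from any of the projects referenced on this page.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetSketch {

    /** Collects the terms produced by the analyzer for the given text. */
    static List<String> tokens(Analyzer analyzer, String text) throws IOException {
        List<String> result = new ArrayList<>();
        try (TokenStream stream = analyzer.tokenStream("field", text)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                        // required before the first incrementToken()
            while (stream.incrementToken()) {
                result.add(termAtt.toString());    // read the current token's term text
            }
            stream.end();                          // record end-of-stream state (e.g. final offset)
        }                                          // try-with-resources closes the stream
        return result;
    }

    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            System.out.println(tokens(analyzer, "reset must be called before incrementToken"));
        }
    }
}

Forgetting reset() is a contract violation: Tokenizer-based streams typically throw an IllegalStateException ("TokenStream contract violation") on the first incrementToken() call, which is why every example below calls reset() right after obtaining the stream.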
Example 1
Source File: TestPatternTokenizer.java    From lucene-solr with Apache License 2.0
/** 
 * TODO: rewrite tests not to use string comparison.
 */
private static String tsToString(TokenStream in) throws IOException {
  StringBuilder out = new StringBuilder();
  CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
  // extra safety to enforce that the state is not preserved, and also
  // assign bogus values
  in.clearAttributes();
  termAtt.setEmpty().append("bogusTerm");
  in.reset();
  while (in.incrementToken()) {
    if (out.length() > 0)
      out.append(' ');
    out.append(termAtt.toString());
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
  }

  in.close();
  return out.toString();
}
 
Example 2
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testAfterStrSingleSynonymExpand1() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("ab"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1");

  a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abb"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/b,2,3,1");

  a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcd"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/c,2,3,1/d,3,4,1");

  a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcde"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/c,2,3,1/d,3,4,1/e,4,5,1");
}
 
Example 3
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testSandwichSynonymExpand2() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("bab"));
  stream.reset();
  assertTokenStream(stream, "b,0,1,1/a,1,2,1/aa,1,2,0/b,2,3,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("bbabb"));
  stream.reset();
  assertTokenStream(stream, "bb,0,2,1/b,1,2,0/a,2,3,1/aa,2,3,0/b,3,4,1/bb,3,5,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("dcbabcd"));
  stream.reset();
  assertTokenStream(stream, "dc,0,2,1/cb,1,3,1/b,2,3,0/a,3,4,1/aa,3,4,0/b,4,5,1/bc,4,6,0/cd,5,7,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("edcbabcde"));
  stream.reset();
  assertTokenStream(stream, "ed,0,2,1/dc,1,3,1/cb,2,4,1/b,3,4,0/a,4,5,1/aa,4,5,0/b,5,6,1/bc,5,7,0/cd,6,8,1/de,7,9,1");
}
 
Example 4
Source File: DefaultQueryBuilder.java    From modernmt with Apache License 2.0
private static void loadTerms(String fieldName, Sentence sentence, Analyzer analyzer, BooleanQuery output) {
    final int maxClauseCount = BooleanQuery.getMaxClauseCount();
    String text = TokensOutputStream.serialize(sentence, false, true);

    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(fieldName, text);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        int count = 0;

        stream.reset();
        while (stream.incrementToken() && (count + 1) < maxClauseCount) {
            Term term = new Term(fieldName, charTermAttribute.toString());
            output.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            count++;
        }
    } catch (IOException e) {
        throw new Error("This should never happen", e);
    } finally {
        closeQuietly(stream);
    }
}
 
Example 5
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testSandwichStr2() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("aba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/b,1,2,1/a,2,3,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("abba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/bb,1,3,1/a,3,4,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcda"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/bc,1,3,1/cd,2,4,1/a,4,5,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcdea"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/bc,1,3,1/cd,2,4,1/de,3,5,1/a,5,6,1");
}
 
Example 6
Source File: ChineseWordAnalyzerTest.java    From word with Apache License 2.0
@Test
public void test1() {
    try{
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "杨尚川是APDPlat应用级产品开发平台的作者");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while(tokenStream.incrementToken()){
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[杨尚川, 是, apdplat, 应用级, 产品, 开发, 平台, 的, 作者]";
        if("bigram".equals(WordConfTools.get("ngram", "bigram"))){
            expResult = "[杨尚川, 是, apdplat, 应用, 级, 产品, 开发, 平台, 的, 作者]";
        }
        assertEquals(expResult, words.toString());
    }catch(IOException e){
        fail("分词出错"+e.getMessage());
    }
}
 
Example 7
Source File: Tokenizer.java    From SONDY with GNU General Public License v3.0
public static List<String> tokenizeString(org.apache.lucene.analysis.Analyzer analyzer, String string) {
    List<String> result = new ArrayList<>(20);
    try {
        TokenStream stream  = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.close();
    } catch (IOException e) {}
    return result;
}
 
Example 8
Source File: EdgeNGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
public void testEndPositionIncrement() throws IOException {
  TokenStream source = whitespaceMockTokenizer("seventeen one two three four");
  TokenStream input = new EdgeNGramTokenFilter(source, 8, 8, false);
  PositionIncrementAttribute posIncAtt = input.addAttribute(PositionIncrementAttribute.class);
  input.reset();
  while (input.incrementToken()) {}
  input.end();
  assertEquals(4, posIncAtt.getPositionIncrement());
}
 
Example 9
Source File: LuceneUtil.java    From antsdb with GNU Lesser General Public License v3.0
static void tokenize(String text, BiConsumer<String, String> lambda) {
	try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
		TokenStream stream = analyzer.tokenStream("", text);
		CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
		TypeAttribute type = stream.getAttribute(TypeAttribute.class);
		stream.reset();
		while (stream.incrementToken()) {
			lambda.accept(type.type(), term.toString());
		}
	}
	catch (IOException x) {
		throw new RuntimeException(x);
	}
}
 
Example 10
Source File: IKAnalyzerUtils.java    From emotional_analysis with Apache License 2.0
public static List<String> printAnalysisResult(Analyzer analyzer, String keyWord)  
        throws Exception {  
    TokenStream tokenStream = analyzer.tokenStream("content",  
            new StringReader(keyWord));
    tokenStream.reset();
    tokenStream.addAttribute(CharTermAttribute.class);
    ArrayList<String> keys = new ArrayList<>();
    while (tokenStream.incrementToken()) {  
        CharTermAttribute charTermAttribute = tokenStream  
                .getAttribute(CharTermAttribute.class);  
        keys.add(charTermAttribute.toString());
    }  
    return keys;
}
 
Example 11
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testMultipleSynonyms() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1, false, "a,aa/b,bb");
  TokenStream stream = a.tokenStream("f", new StringReader("ababb"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/b,1,2,1/a,2,3,1/bb,3,5,1");

  a = new NGramSynonymTokenizerTestAnalyzer(1, false, "a,aa/b,bb/c,cc");
  stream = a.tokenStream("f", new StringReader("cba"));
  stream.reset();
  assertTokenStream(stream, "c,0,1,1/b,1,2,1/a,2,3,1");
}
 
Example 12
Source File: TestEmptyTokenStream.java    From lucene-solr with Apache License 2.0
public void testConsume() throws IOException {
  TokenStream ts = new EmptyTokenStream();
  ts.reset();
  assertFalse(ts.incrementToken());
  ts.end();
  ts.close();
  // try again with reuse:
  ts.reset();
  assertFalse(ts.incrementToken());
  ts.end();
  ts.close();
}
 
Example 13
Source File: LuceneAnalyzerIntegrationTest.java    From tutorials with MIT License
public List<String> analyze(String text, Analyzer analyzer) throws IOException {
    List<String> result = new ArrayList<String>();
    TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
    CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        result.add(attr.toString());
    }
    return result;
}
 
Example 14
Source File: TestJapaneseNumberFilter.java    From lucene-solr with Apache License 2.0
public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
  TokenStream stream = analyzer.tokenStream("dummy", reader);
  stream.reset();

  CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

  while (stream.incrementToken()) {
    writer.write(termAttr.toString());
    writer.write("\n");
  }

  reader.close();
  writer.close();
}
 
Example 15
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testSandwichStrExpand3() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(3, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("aba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/a,2,3,1/aa,2,3,0");

  a = new NGramSynonymTokenizerTestAnalyzer(3, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bb,1,3,0/b,2,3,0/a,3,4,1/aa,3,4,0");

  a = new NGramSynonymTokenizerTestAnalyzer(3, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcda"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/bcd,1,4,0/cd,2,4,0/d,3,4,0/a,4,5,1/aa,4,5,0");

  a = new NGramSynonymTokenizerTestAnalyzer(3, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcdea"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/bcd,1,4,0/cde,2,5,1/de,3,5,0/e,4,5,0/a,5,6,1/aa,5,6,0");

  a = new NGramSynonymTokenizerTestAnalyzer(3, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcdefa"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/bcd,1,4,0/cde,2,5,1/def,3,6,1/ef,4,6,0/f,5,6,0/a,6,7,1/aa,6,7,0");
}
 
Example 16
Source File: MinHashQParser.java    From lucene-solr with Apache License 2.0
private void getHashesFromTokenStream(String analyserField, ArrayList<BytesRef> hashes) throws Exception {
  TokenStream ts = getReq().getSchema().getIndexAnalyzer().tokenStream(analyserField, qstr);
  TermToBytesRefAttribute termAttribute = ts.getAttribute(TermToBytesRefAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    BytesRef term = termAttribute.getBytesRef();
    hashes.add(BytesRef.deepCopyOf(term));
  }
  ts.end();
  ts.close();
}
 
Example 17
Source File: SimpleNaiveBayesDocumentClassifier.java    From lucene-solr with Apache License 2.0
/**
 * Returns a token array from the given {@link org.apache.lucene.analysis.TokenStream}
 *
 * @param tokenizedText the tokenized content of a field
 * @return a {@code String} array of the resulting tokens
 * @throws java.io.IOException If tokenization fails because there is a low-level I/O error
 */
protected String[] getTokenArray(TokenStream tokenizedText) throws IOException {
  Collection<String> tokens = new LinkedList<>();
  CharTermAttribute charTermAttribute = tokenizedText.addAttribute(CharTermAttribute.class);
  tokenizedText.reset();
  while (tokenizedText.incrementToken()) {
    tokens.add(charTermAttribute.toString());
  }
  tokenizedText.end();
  tokenizedText.close();
  return tokens.toArray(new String[0]);
}
 
Example 18
Source File: TokenSourcesTest.java    From lucene-solr with Apache License 2.0
public void testMaxStartOffsetConsistency() throws IOException {
  FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  tvFieldType.setStoreTermVectors(true);
  tvFieldType.setStoreTermVectorOffsets(true);
  tvFieldType.setStoreTermVectorPositions(true);

  Directory dir = newDirectory();

  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setEnableChecks(false); // we don't necessarily consume the whole stream because of limiting by startOffset
  Document doc = new Document();
  final String TEXT = " f gg h";
  doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
  doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));

  IndexReader reader;
  try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    writer.addDocument(doc);
    reader = writer.getReader();
  }
  try {
    Fields tvFields = reader.getTermVectors(0);
    for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
      TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
      TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);

      //assert have same tokens, none of which has a start offset > maxStartOffset
      final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
      final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
      tvStream.reset();
      anaStream.reset();
      while (tvStream.incrementToken()) {
        assertTrue(anaStream.incrementToken());
        assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
        if (maxStartOffset >= 0)
          assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
      }
      assertTrue(anaStream.incrementToken() == false);
      tvStream.end();
      anaStream.end();
      tvStream.close();
      anaStream.close();
    }

  } finally {
    reader.close();
  }

  dir.close();
}
 
Example 19
Source File: LuceneAnalyzerTest.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void test1() throws Exception {
    MynlpAnalyzer analyzer = new MynlpAnalyzer(Lexers.core().filterReader(true, true));

    TokenStream tokenStream = analyzer.tokenStream("title", "商品和服务,上海市副市长,Git有很多优势,其中之一就是远程操作非常简便。本文详细介绍5个Git命令,它们的概念和用法,理解了这些内容,你就会完全掌握Git远程操作。");
    tokenStream.reset();

    StringBuffer sb = new StringBuffer();

    while (tokenStream.incrementToken()) {
        sb.append(tokenStream.getAttribute(CharTermAttribute.class));
        sb.append("\t");
        sb.append(tokenStream.getAttribute(OffsetAttribute.class).startOffset());
        sb.append("\t");
        sb.append(tokenStream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
        sb.append("\n");
    }

    analyzer.close();
    System.out.println(sb.toString());
//
//        Assert.assertTrue(sb.toString().equals(
//                "商品\t0\t1\n" +
//                        "服务\t3\t2\n" +
//                        "上海市\t6\t1\n" +
//                        "副市长\t9\t1\n" +
//                        "git\t13\t1\n" +
//                        "很多\t17\t2\n" +
//                        "优势\t19\t1\n" +
//                        "远程\t28\t4\n" +
//                        "操作\t30\t1\n" +
//                        "非常\t32\t1\n" +
//                        "简便\t34\t1\n" +
//                        "本文\t37\t1\n" +
//                        "详细\t39\t1\n" +
//                        "介绍\t41\t1\n" +
//                        "5个\t43\t1\n" +
//                        "git\t45\t1\n" +
//                        "命令\t48\t1\n" +
//                        "概念\t54\t3\n" +
//                        "用法\t57\t2\n" +
//                        "理解\t60\t1\n" +
//                        "内容\t65\t3\n" +
//                        "会\t70\t3\n" +
//                        "完全\t71\t1\n" +
//                        "掌握\t73\t1\n" +
//                        "git\t75\t1\n" +
//                        "远程\t78\t1\n" +
//                        "操作\t80\t1\n"));
}
 
Example 20
Source File: TokenSourcesTest.java    From lucene-solr with Apache License 2.0
public void testPayloads() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  myFieldType.setStoreTermVectors(true);
  myFieldType.setStoreTermVectorOffsets(true);
  myFieldType.setStoreTermVectorPositions(true);
  myFieldType.setStoreTermVectorPayloads(true);

  curOffset = 0;

  Token[] tokens = new Token[] {
    getToken("foxes"),
    getToken("can"),
    getToken("jump"),
    getToken("high")
  };

  Document doc = new Document();
  doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
  writer.addDocument(doc);

  IndexReader reader = writer.getReader();
  writer.close();
  assertEquals(1, reader.numDocs());

  TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);

  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
  PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);

  ts.reset();
  for(Token token : tokens) {
    assertTrue(ts.incrementToken());
    assertEquals(token.toString(), termAtt.toString());
    assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
    assertEquals(token.getPayload(), payloadAtt.getPayload());
    assertEquals(token.startOffset(), offsetAtt.startOffset());
    assertEquals(token.endOffset(), offsetAtt.endOffset());
  }

  assertFalse(ts.incrementToken());

  reader.close();
  dir.close();
}