Java Code Examples for org.apache.lucene.analysis.TokenStream#reset()

The following examples show how to use org.apache.lucene.analysis.TokenStream#reset(). You can go to the original project or source file by following the links above each example, and check out the related API usage in the sidebar.
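All of the examples revolve around the same TokenStream consumption contract: obtain the stream from an Analyzer, call reset() before the first incrementToken(), read attributes inside the consume loop, and call end() and close() once the stream has been fully consumed (a few of the examples below omit that last step). The snippet below is a minimal, self-contained sketch of that pattern; the class name, field name, and sample text are placeholders and do not come from any of the projects referenced on this page.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetSketch {

    /** Collects the terms produced by the analyzer for the given text. */
    static List<String> tokens(Analyzer analyzer, String text) throws IOException {
        List<String> result = new ArrayList<>();
        try (TokenStream stream = analyzer.tokenStream("field", text)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                        // required before the first incrementToken()
            while (stream.incrementToken()) {
                result.add(termAtt.toString());    // read the current token's term text
            }
            stream.end();                          // record end-of-stream state (e.g. final offset)
        }                                          // try-with-resources closes the stream
        return result;
    }

    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            System.out.println(tokens(analyzer, "reset must be called before incrementToken"));
        }
    }
}

Forgetting reset() is a contract violation: Tokenizer-based streams typically throw an IllegalStateException ("TokenStream contract violation") on the first incrementToken() call, which is why every example below calls reset() right after obtaining the stream.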
Example 1
Source File: TestPatternTokenizer.java    From lucene-solr with Apache License 2.0
/** 
 * TODO: rewrite tests not to use string comparison.
 */
private static String tsToString(TokenStream in) throws IOException {
  StringBuilder out = new StringBuilder();
  CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
  // extra safety to enforce that the state is not preserved, and also
  // assign bogus values
  in.clearAttributes();
  termAtt.setEmpty().append("bogusTerm");
  in.reset();
  while (in.incrementToken()) {
    if (out.length() > 0)
      out.append(' ');
    out.append(termAtt.toString());
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
  }

  in.close();
  return out.toString();
}
 
Example 2
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testAfterStrSingleSynonymExpand1() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("ab"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1");

  a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abb"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/b,2,3,1");

  a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcd"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/c,2,3,1/d,3,4,1");

  a = new NGramSynonymTokenizerTestAnalyzer(1, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcde"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/c,2,3,1/d,3,4,1/e,4,5,1");
}
 
Example 3
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testSandwichSynonymExpand2() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("bab"));
  stream.reset();
  assertTokenStream(stream, "b,0,1,1/a,1,2,1/aa,1,2,0/b,2,3,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("bbabb"));
  stream.reset();
  assertTokenStream(stream, "bb,0,2,1/b,1,2,0/a,2,3,1/aa,2,3,0/b,3,4,1/bb,3,5,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("dcbabcd"));
  stream.reset();
  assertTokenStream(stream, "dc,0,2,1/cb,1,3,1/b,2,3,0/a,3,4,1/aa,3,4,0/b,4,5,1/bc,4,6,0/cd,5,7,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("edcbabcde"));
  stream.reset();
  assertTokenStream(stream, "ed,0,2,1/dc,1,3,1/cb,2,4,1/b,3,4,0/a,4,5,1/aa,4,5,0/b,5,6,1/bc,5,7,0/cd,6,8,1/de,7,9,1");
}
 
Example 4
Source File: DefaultQueryBuilder.java    From modernmt with Apache License 2.0
private static void loadTerms(String fieldName, Sentence sentence, Analyzer analyzer, BooleanQuery output) {
    final int maxClauseCount = BooleanQuery.getMaxClauseCount();
    String text = TokensOutputStream.serialize(sentence, false, true);

    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(fieldName, text);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        int count = 0;

        stream.reset();
        while (stream.incrementToken() && (count + 1) < maxClauseCount) {
            Term term = new Term(fieldName, charTermAttribute.toString());
            output.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            count++;
        }
    } catch (IOException e) {
        throw new Error("This should never happen", e);
    } finally {
        closeQuietly(stream);
    }
}
 
Example 5
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testSandwichStr2() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("aba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/b,1,2,1/a,2,3,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("abba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/bb,1,3,1/a,3,4,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcda"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/bc,1,3,1/cd,2,4,1/a,4,5,1");

  a = new NGramSynonymTokenizerTestAnalyzer(2, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcdea"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/bc,1,3,1/cd,2,4,1/de,3,5,1/a,5,6,1");
}
 
Example 6
Source File: ChineseWordAnalyzerTest.java    From word with Apache License 2.0
@Test
public void test1() {
    try{
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "杨尚川是APDPlat应用级产品开发平台的作者");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while(tokenStream.incrementToken()){
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[杨尚川, 是, apdplat, 应用级, 产品, 开发, 平台, 的, 作者]";
        if("bigram".equals(WordConfTools.get("ngram", "bigram"))){
            expResult = "[杨尚川, 是, apdplat, 应用, 级, 产品, 开发, 平台, 的, 作者]";
        }
        assertEquals(expResult, words.toString());
    }catch(IOException e){
        fail("分词出错"+e.getMessage());
    }
}
 
Example 7
Source File: Tokenizer.java    From SONDY with GNU General Public License v3.0
public static List<String> tokenizeString(org.apache.lucene.analysis.Analyzer analyzer, String string) {
    List<String> result = new ArrayList<>(20);
    try {
        TokenStream stream  = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.close();
    } catch (IOException e) {}
    return result;
}
 
Example 8
Source File: EdgeNGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
public void testEndPositionIncrement() throws IOException {
  TokenStream source = whitespaceMockTokenizer("seventeen one two three four");
  TokenStream input = new EdgeNGramTokenFilter(source, 8, 8, false);
  PositionIncrementAttribute posIncAtt = input.addAttribute(PositionIncrementAttribute.class);
  input.reset();
  while (input.incrementToken()) {}
  input.end();
  assertEquals(4, posIncAtt.getPositionIncrement());
}
 
Example 9
Source File: LuceneUtil.java    From antsdb with GNU Lesser General Public License v3.0
static void tokenize(String text, BiConsumer<String, String> lambda) {
	try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
		TokenStream stream = analyzer.tokenStream("", text);
		CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
		TypeAttribute type = stream.getAttribute(TypeAttribute.class);
		stream.reset();
		while (stream.incrementToken()) {
			lambda.accept(type.type(), term.toString());
		}
	}
	catch (IOException x) {
		throw new RuntimeException(x);
	}
}
 
Example 10
Source File: IKAnalyzerUtils.java    From emotional_analysis with Apache License 2.0
public static List<String> printAnalysisResult(Analyzer analyzer, String keyWord)  
        throws Exception {  
    TokenStream tokenStream = analyzer.tokenStream("content",  
            new StringReader(keyWord));
    tokenStream.reset();
    tokenStream.addAttribute(CharTermAttribute.class);
    ArrayList<String> keys = new ArrayList<>();
    while (tokenStream.incrementToken()) {  
        CharTermAttribute charTermAttribute = tokenStream  
                .getAttribute(CharTermAttribute.class);  
        keys.add(charTermAttribute.toString());
    }  
    return keys;
}
 
Example 11
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testMultipleSynonyms() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(1, false, "a,aa/b,bb");
  TokenStream stream = a.tokenStream("f", new StringReader("ababb"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/b,1,2,1/a,2,3,1/bb,3,5,1");

  a = new NGramSynonymTokenizerTestAnalyzer(1, false, "a,aa/b,bb/c,cc");
  stream = a.tokenStream("f", new StringReader("cba"));
  stream.reset();
  assertTokenStream(stream, "c,0,1,1/b,1,2,1/a,2,3,1");
}
 
Example 12
Source File: TestEmptyTokenStream.java    From lucene-solr with Apache License 2.0
public void testConsume() throws IOException {
  TokenStream ts = new EmptyTokenStream();
  ts.reset();
  assertFalse(ts.incrementToken());
  ts.end();
  ts.close();
  // try again with reuse:
  ts.reset();
  assertFalse(ts.incrementToken());
  ts.end();
  ts.close();
}
 
Example 13
Source File: LuceneAnalyzerIntegrationTest.java    From tutorials with MIT License
public List<String> analyze(String text, Analyzer analyzer) throws IOException {
    List<String> result = new ArrayList<String>();
    TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
    CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        result.add(attr.toString());
    }
    return result;
}
 
Example 14
Source File: TestJapaneseNumberFilter.java    From lucene-solr with Apache License 2.0
public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
  TokenStream stream = analyzer.tokenStream("dummy", reader);
  stream.reset();

  CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

  while (stream.incrementToken()) {
    writer.write(termAttr.toString());
    writer.write("\n");
  }

  reader.close();
  writer.close();
}
 
Example 15
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testSandwichStrExpand3() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(3, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("aba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/a,2,3,1/aa,2,3,0");

  a = new NGramSynonymTokenizerTestAnalyzer(3, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bb,1,3,0/b,2,3,0/a,3,4,1/aa,3,4,0");

  a = new NGramSynonymTokenizerTestAnalyzer(3, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcda"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/bcd,1,4,0/cd,2,4,0/d,3,4,0/a,4,5,1/aa,4,5,0");

  a = new NGramSynonymTokenizerTestAnalyzer(3, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcdea"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/bcd,1,4,0/cde,2,5,1/de,3,5,0/e,4,5,0/a,5,6,1/aa,5,6,0");

  a = new NGramSynonymTokenizerTestAnalyzer(3, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcdefa"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/bcd,1,4,0/cde,2,5,1/def,3,6,1/ef,4,6,0/f,5,6,0/a,6,7,1/aa,6,7,0");
}
 
Example 16
Source File: MinHashQParser.java    From lucene-solr with Apache License 2.0
private void getHashesFromTokenStream(String analyserField, ArrayList<BytesRef> hashes) throws Exception {
  TokenStream ts = getReq().getSchema().getIndexAnalyzer().tokenStream(analyserField, qstr);
  TermToBytesRefAttribute termAttribute = ts.getAttribute(TermToBytesRefAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    BytesRef term = termAttribute.getBytesRef();
    hashes.add(BytesRef.deepCopyOf(term));
  }
  ts.end();
  ts.close();
}
 
Example 17
Source File: SimpleNaiveBayesDocumentClassifier.java    From lucene-solr with Apache License 2.0
/**
 * Returns a token array from the given {@link org.apache.lucene.analysis.TokenStream}
 *
 * @param tokenizedText the tokenized content of a field
 * @return a {@code String} array of the resulting tokens
 * @throws java.io.IOException If tokenization fails because there is a low-level I/O error
 */
protected String[] getTokenArray(TokenStream tokenizedText) throws IOException {
  Collection<String> tokens = new LinkedList<>();
  CharTermAttribute charTermAttribute = tokenizedText.addAttribute(CharTermAttribute.class);
  tokenizedText.reset();
  while (tokenizedText.incrementToken()) {
    tokens.add(charTermAttribute.toString());
  }
  tokenizedText.end();
  tokenizedText.close();
  return tokens.toArray(new String[0]);
}
 
Example 18
Source File: TokenSourcesTest.java    From lucene-solr with Apache License 2.0
public void testMaxStartOffsetConsistency() throws IOException {
  FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  tvFieldType.setStoreTermVectors(true);
  tvFieldType.setStoreTermVectorOffsets(true);
  tvFieldType.setStoreTermVectorPositions(true);

  Directory dir = newDirectory();

  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setEnableChecks(false); // we don't necessarily consume the whole stream because of limiting by startOffset
  Document doc = new Document();
  final String TEXT = " f gg h";
  doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
  doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));

  IndexReader reader;
  try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    writer.addDocument(doc);
    reader = writer.getReader();
  }
  try {
    Fields tvFields = reader.getTermVectors(0);
    for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
      TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
      TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);

      //assert have same tokens, none of which has a start offset > maxStartOffset
      final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
      final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
      tvStream.reset();
      anaStream.reset();
      while (tvStream.incrementToken()) {
        assertTrue(anaStream.incrementToken());
        assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
        if (maxStartOffset >= 0)
          assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
      }
      assertTrue(anaStream.incrementToken() == false);
      tvStream.end();
      anaStream.end();
      tvStream.close();
      anaStream.close();
    }

  } finally {
    reader.close();
  }

  dir.close();
}
 
Example 19
Source File: LuceneAnalyzerTest.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void test1() throws Exception {
    MynlpAnalyzer analyzer = new MynlpAnalyzer(Lexers.core().filterReader(true, true));

    TokenStream tokenStream = analyzer.tokenStream("title", "商品和服务,上海市副市长,Git有很多优势,其中之一就是远程操作非常简便。本文详细介绍5个Git命令,它们的概念和用法,理解了这些内容,你就会完全掌握Git远程操作。");
    tokenStream.reset();

    StringBuffer sb = new StringBuffer();

    while (tokenStream.incrementToken()) {
        sb.append(tokenStream.getAttribute(CharTermAttribute.class));
        sb.append("\t");
        sb.append(tokenStream.getAttribute(OffsetAttribute.class).startOffset());
        sb.append("\t");
        sb.append(tokenStream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
        sb.append("\n");
    }

    analyzer.close();
    System.out.println(sb.toString());
//
//        Assert.assertTrue(sb.toString().equals(
//                "商品\t0\t1\n" +
//                        "服务\t3\t2\n" +
//                        "上海市\t6\t1\n" +
//                        "副市长\t9\t1\n" +
//                        "git\t13\t1\n" +
//                        "很多\t17\t2\n" +
//                        "优势\t19\t1\n" +
//                        "远程\t28\t4\n" +
//                        "操作\t30\t1\n" +
//                        "非常\t32\t1\n" +
//                        "简便\t34\t1\n" +
//                        "本文\t37\t1\n" +
//                        "详细\t39\t1\n" +
//                        "介绍\t41\t1\n" +
//                        "5个\t43\t1\n" +
//                        "git\t45\t1\n" +
//                        "命令\t48\t1\n" +
//                        "概念\t54\t3\n" +
//                        "用法\t57\t2\n" +
//                        "理解\t60\t1\n" +
//                        "内容\t65\t3\n" +
//                        "会\t70\t3\n" +
//                        "完全\t71\t1\n" +
//                        "掌握\t73\t1\n" +
//                        "git\t75\t1\n" +
//                        "远程\t78\t1\n" +
//                        "操作\t80\t1\n"));
}
 
Example 20
Source File: TokenSourcesTest.java    From lucene-solr with Apache License 2.0
public void testPayloads() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  myFieldType.setStoreTermVectors(true);
  myFieldType.setStoreTermVectorOffsets(true);
  myFieldType.setStoreTermVectorPositions(true);
  myFieldType.setStoreTermVectorPayloads(true);

  curOffset = 0;

  Token[] tokens = new Token[] {
    getToken("foxes"),
    getToken("can"),
    getToken("jump"),
    getToken("high")
  };

  Document doc = new Document();
  doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
  writer.addDocument(doc);

  IndexReader reader = writer.getReader();
  writer.close();
  assertEquals(1, reader.numDocs());

  TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);

  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
  PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);

  ts.reset();
  for(Token token : tokens) {
    assertTrue(ts.incrementToken());
    assertEquals(token.toString(), termAtt.toString());
    assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
    assertEquals(token.getPayload(), payloadAtt.getPayload());
    assertEquals(token.startOffset(), offsetAtt.startOffset());
    assertEquals(token.endOffset(), offsetAtt.endOffset());
  }

  assertFalse(ts.incrementToken());

  reader.close();
  dir.close();
}