Java Code Examples for org.apache.lucene.analysis.Tokenizer#close()

The following examples show how to use org.apache.lucene.analysis.Tokenizer#close(). You can vote up the examples you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: TestOpenNLPTokenizerFactory.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
@Test
public void testClose() throws IOException {
  // Configure the factory with the test sentence/tokenizer models.
  // Use a plain HashMap instead of double-brace initialization: the
  // double-brace idiom creates an anonymous HashMap subclass that holds
  // a hidden reference to the enclosing test instance.
  Map<String,String> args = new HashMap<>();
  args.put("sentenceModel", "en-test-sent.bin");
  args.put("tokenizerModel", "en-test-tokenizer.bin");
  OpenNLPTokenizerFactory factory = new OpenNLPTokenizerFactory(args);
  factory.inform(new ClasspathResourceLoader(getClass()));

  Tokenizer ts = factory.create(newAttributeFactory());
  ts.setReader(new StringReader(SENTENCES));

  // Close the tokenizer mid-stream, then verify it can be reused:
  // reset() + setReader() must restore it to a fully consumable state.
  ts.reset();
  ts.close();
  ts.reset();
  ts.setReader(new StringReader(SENTENCES));
  assertTokenStreamContents(ts, SENTENCES_punc);
  ts.close();
  ts.reset();
  ts.setReader(new StringReader(SENTENCES));
  assertTokenStreamContents(ts, SENTENCES_punc);
}
 
Example 2
Source File: MeCabKoStandardTokenizerTest.java    From mecab-ko-lucene-analyzer with Apache License 2.0 6 votes vote down vote up
@Test
public void testShortSentence() throws Exception {
  // First sentence: decompounding enabled with a minimum compound length of 2.
  Tokenizer t = createTokenizer(
      new StringReader("꽃배달 꽃망울 오토바이"), 2);
  String expectedFirst =
      "꽃:N:NNG:null:1:1:0:1,배달:N:NNG:null:1:1:1:3,"
      + "꽃:N:NNG:null:1:1:4:5,꽃망울:COMPOUND:Compound:null:0:2:4:7,"
      + "망울:N:NNG:null:1:1:5:7,오토바이:N:NNG:null:1:1:8:12,";
  assertEquals(expectedFirst, tokenizerToString(t));

  // Reuse the same tokenizer instance on a second sentence.
  t.reset();
  t.setReader(new StringReader("소설 무궁화꽃이 피었습니다."));
  String expectedSecond =
      "소설:N:NNG:null:1:1:0:2,무궁:N:NNG:null:1:1:3:5,"
      + "무궁화:COMPOUND:Compound:null:0:2:3:6,화:N:NNG:null:1:1:5:6,"
      + "꽃이:EOJEOL:NNG+JKS:null:1:1:6:8,꽃:N:NNG:null:0:1:6:7,"
      + "피었습니다:EOJEOL:VV+EP+EF:null:1:1:9:14,";
  assertEquals(expectedSecond, tokenizerToString(t));
  t.close();
}
 
Example 3
Source File: MeCabKoStandardTokenizerTest.java    From mecab-ko-lucene-analyzer with Apache License 2.0 6 votes vote down vote up
@Ignore
public void testComplexSentence() throws Exception {
  // Longer multi-eojeol sentence; currently ignored (not run by the suite).
  Tokenizer t = createTokenizer(
      new StringReader(
          "지금보다 어리고 민감하던 시절 아버지가 충고를 한마디 했는데 " +
          "아직도 그 말이 기억난다."),
      TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
  String expected =
      "지금보다:EOJEOL:1:1:0:4,지금:N:0:1:0:2,어리고:EOJEOL:1:1:5:8,"
      + "민감하던:EOJEOL:1:1:9:13,민감:XR:0:1:9:11,시절:N:1:1:14:16,"
      + "아버지가:EOJEOL:1:1:17:21,아버지:N:0:1:17:20,충고를:EOJEOL:1:1:22:25,"
      + "충고:N:0:1:22:24,한:N:1:1:26:27,한마디:COMPOUND:0:2:26:29,"
      + "마디:N:1:1:27:29,했는데:EOJEOL:1:1:30:33,아직도:EOJEOL:1:1:34:37,"
      + "아직:MAG:0:1:34:36,그:MM:1:1:38:39,말이:EOJEOL:1:1:40:42,"
      + "말:N:0:1:40:41,기억난다:INFLECT:1:1:43:47,";
  assertEquals(expected, tokenizerToString(t));
  t.close();
}
 
Example 4
Source File: MeCabKoStandardTokenizerTest.java    From mecab-ko-lucene-analyzer with Apache License 2.0 6 votes vote down vote up
@Test
public void testDecompound() throws Exception {
  // A compound noun should emit both the whole compound and its parts.
  Tokenizer t = createTokenizer(
      new StringReader("형태소"),
      TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
  String expectedFirst =
      "형태:N:NNG:null:1:1:0:2,형태소:COMPOUND:Compound:null:0:2:0:3,소:N:NNG:null:1:1:2:3,";
  assertEquals(expectedFirst, tokenizerToString(t));
  t.close();

  // Second compound, tokenized with a fresh instance.
  t = createTokenizer(
      new StringReader("가고문헌"),
      TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
  String expectedSecond =
      "가고:N:NNG:null:1:1:0:2,가고문헌:COMPOUND:Compound:null:0:2:0:4,"
      + "문헌:N:NNG:null:1:1:2:4,";
  assertEquals(expectedSecond, tokenizerToString(t));
  t.close();
}
 
Example 5
Source File: TestAnsj.java    From ansj4solr with Apache License 2.0 6 votes vote down vote up
public static void main(String[] args) throws IOException {
	// Parse directly with ansj first so its output can be compared against
	// the tokenizer's output below.
	List<Term> parse = ToAnalysis.parse("天天向上,媒体打打。《回家真好》");
	System.out.println(parse);

	Tokenizer tokenizer = new AnsjTokenizer(new StringReader("天天向上,媒体打打。《回家真好》"), 0, true);
	CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
	OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
	PositionIncrementAttribute positionIncrementAtt =
			tokenizer.addAttribute(PositionIncrementAttribute.class);

	// TokenStream consumer contract: reset() before the first
	// incrementToken(), end() after the last one, then close().
	tokenizer.reset();
	while (tokenizer.incrementToken()) {
		// termAtt.toString() already returns a String; wrapping it in
		// new String(...) was a redundant copy.
		System.out.print(termAtt.toString());
		System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
		System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
	}
	tokenizer.end();
	tokenizer.close();
}
 
Example 6
Source File: TestPatternTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Nightly
public void testHeapFreedAfterClose() throws Exception {
  // TODO: can we move this to BaseTSTC to catch other "hangs onto heap"ers?

  // Build a 1MB input: 1024 chunks, each 1023 spaces followed by an 'x'.
  StringBuilder sb = new StringBuilder();
  for (int chunk = 0; chunk < 1024; chunk++) {
    for (int space = 0; space < 1023; space++) {
      sb.append(' ');
    }
    sb.append('x');
  }
  String big = sb.toString();

  Pattern pattern = Pattern.compile("x");

  // Hold on to 512 tokenizers after closing them; if close() failed to
  // release each tokenizer's internal buffer, this would exhaust the heap.
  List<Tokenizer> tokenizers = new ArrayList<>();
  for (int i = 0; i < 512; i++) {
    Tokenizer stream = new PatternTokenizer(pattern, -1);
    tokenizers.add(stream);
    stream.setReader(new StringReader(big));
    stream.reset();
    for (int j = 0; j < 1024; j++) {
      assertTrue(stream.incrementToken());
    }
    assertFalse(stream.incrementToken());
    stream.end();
    stream.close();
  }
}
 
Example 7
Source File: MeCabKoStandardTokenizerTest.java    From mecab-ko-lucene-analyzer with Apache License 2.0 5 votes vote down vote up
@Test
public void testEmptyQuery() throws Exception {
  // An empty input must yield no tokens at all.
  Tokenizer t = createTokenizer(
      new StringReader(""), TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
  assertEquals(false, t.incrementToken());
  t.close();
}
 
Example 8
Source File: MeCabKoStandardTokenizerTest.java    From mecab-ko-lucene-analyzer with Apache License 2.0 5 votes vote down vote up
@Test
public void testEmptyMorphemes() throws Exception {
  // Pure punctuation/symbols produce no morphemes, hence no tokens.
  Tokenizer t = createTokenizer(
      new StringReader("!@#$%^&*"),
      TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
  assertEquals(false, t.incrementToken());
  t.close();
}
 
Example 9
Source File: MeCabKoStandardTokenizerTest.java    From mecab-ko-lucene-analyzer with Apache License 2.0 5 votes vote down vote up
@Test
public void testHanEnglish() throws Exception {
  // Mixed Hangul + Latin input: the English run is tagged SL.
  Tokenizer t = createTokenizer(
      new StringReader("한글win"),
      TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
  String expected = "한글:N:NNG:null:1:1:0:2,win:SL:SL:null:1:1:2:5,";
  assertEquals(expected, tokenizerToString(t));
  t.close();
}
 
Example 10
Source File: MeCabKoStandardTokenizerTest.java    From mecab-ko-lucene-analyzer with Apache License 2.0 5 votes vote down vote up
@Test
public void testNoDecompound() throws Exception {
  // With decompounding disabled, only the whole compound token is emitted.
  Tokenizer t = createTokenizer(
      new StringReader("형태소"),
      TokenGenerator.NO_DECOMPOUND);
  assertEquals("형태소:COMPOUND:NNG:null:1:2:0:3,", tokenizerToString(t));
  t.close();

  // Same check with a second compound and a fresh instance.
  t = createTokenizer(
      new StringReader("가고문헌"),
      TokenGenerator.NO_DECOMPOUND);
  assertEquals("가고문헌:COMPOUND:NNG:null:1:2:0:4,", tokenizerToString(t));
  t.close();
}
 
Example 11
Source File: MeCabKoStandardTokenizerTest.java    From mecab-ko-lucene-analyzer with Apache License 2.0 5 votes vote down vote up
@Test
public void testPreanalysisSentence() throws Exception {
  // Sentence containing a pre-analyzed dictionary entry ("은전한닢").
  Tokenizer t = createTokenizer(
      new StringReader("은전한닢 프로젝트는 오픈소스이다."),
      TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
  String expected =
      "은전:N:NNG:null:1:1:0:2,한:N:NR:null:1:1:2:3,닢:N:NNG:null:1:1:3:4,"
      + "프로젝트는:EOJEOL:NNG+JX:null:1:1:5:10,프로젝트:N:NNG:null:0:1:5:9,"
      + "오픈:N:NNG:null:1:1:11:13,소스이다:EOJEOL:NNG+VCP+EF:null:1:1:13:17,"
      + "소스:N:NNG:null:0:1:13:15,";
  assertEquals(expected, tokenizerToString(t));
  t.close();
}
 
Example 12
Source File: MeCabKoStandardTokenizerTest.java    From mecab-ko-lucene-analyzer with Apache License 2.0 5 votes vote down vote up
@Test
public void testUnknownSurface() throws Exception {
  // A surface form absent from the dictionary is tagged UNKNOWN.
  Tokenizer t = createTokenizer(
      new StringReader("걀꿀 없는 단어"),
      TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
  String expected =
      "걀꿀:UNKNOWN:UNKNOWN:null:1:1:0:2,없는:EOJEOL:VA+ETM:null:1:1:3:5,"
      + "단어:N:NNG:null:1:1:6:8,";
  assertEquals(expected, tokenizerToString(t));
  t.close();
}