Java Code Examples for org.apache.lucene.analysis.TokenStream#incrementToken()

The following examples show how to use org.apache.lucene.analysis.TokenStream#incrementToken(). The examples are drawn from a range of open-source projects; follow the link above each example to view the original project and source file.
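All of the examples below follow the same basic consumption pattern: obtain the attribute instances you need from the stream, call reset() once, loop on incrementToken() until it returns false, then call end() and close(). As a reference point, here is a minimal, self-contained sketch of that pattern; the field name "body" and the choice of StandardAnalyzer are illustrative assumptions, not requirements of the API.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class IncrementTokenDemo {

    /** Returns the terms produced by analyzing the given text. */
    public static List<String> tokenize(String text) throws IOException {
        List<String> terms = new ArrayList<>();
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("body", text)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                    // mandatory before the first incrementToken()
            while (stream.incrementToken()) {  // false once the stream is exhausted
                terms.add(termAtt.toString());
            }
            stream.end();                      // records final offset/position state
        }                                      // try-with-resources closes stream and analyzer
        return terms;
    }
}

Note that incrementToken() does not return a token object: it refills the attribute instances in place, which is why the attributes are fetched once, outside the loop.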
Example 1
Source File: AnalyzerFactoryTestCase.java    From airsonic-advanced with GNU General Public License v3.0
@SuppressWarnings("unused")
private List<String> toQueryTermString(String field, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzerFactory.getQueryAnalyzer().tokenStream(field,
                new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString()
                    .replaceAll("^term\\=", ""));
        }
        stream.close();
    } catch (IOException e) {
        LoggerFactory.getLogger(AnalyzerFactoryTestCase.class)
                .error("Error during Token processing.", e);
    }
    return result;
}
 
Example 2
Source File: QueryBuilder.java    From lucene-solr with Apache License 2.0
/** 
 * Creates complex boolean query from the cached tokenstream contents 
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
  BooleanQuery.Builder q = newBooleanQuery();
  List<TermAndBoost> currentQuery = new ArrayList<>();
  
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);

  stream.reset();
  while (stream.incrementToken()) {
    if (posIncrAtt.getPositionIncrement() != 0) {
      add(q, currentQuery, operator);
      currentQuery.clear();
    }
    currentQuery.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
  }
  add(q, currentQuery, operator);
  
  return q.build();
}
 
Example 3
Source File: HanLPTokenizerTest.java    From hanlp-lucene-plugin with Apache License 2.0
public void testPinyinTokenFilter() throws Exception
{
    Map<String, String> args = new HashMap<>();
    args.put("original", "true");
    args.put("pinyin", "false");
    args.put("pinyinFirstChar", "true");
    HanLPPinyinTokenFilterFactory factory = new HanLPPinyinTokenFilterFactory(args);
    TokenStream tokenStream = factory.create(tokenizer);
    CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
    // offsets
    OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
    // position increment
    PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
    // part of speech
    TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
    tokenStream.reset(); // required before the first call to incrementToken()
    while (tokenStream.incrementToken())
    {
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
 
Example 4
Source File: ChineseWordAnalyzerTest.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void test2() {
    try {
        Analyzer analyzer = new WordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "叔叔亲了我妈妈也亲了我");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[叔叔, 亲了, 我, 妈妈, 也, 亲了, 我]";
        Assert.assertEquals(expResult, words.toString());
    } catch (IOException e) {
        Assert.fail("分词出错" + e.getMessage());
    }
}
 
Example 5
Source File: AnalysisRequestHandlerBase.java    From lucene-solr with Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }

  return tokens;
}
 
Example 6
Source File: TestDelimitedPayloadTokenFilterFactory.java    From lucene-solr with Apache License 2.0
public void testDelim() throws Exception {
  Reader reader = new StringReader("the*0.1 quick*0.1 red*0.1");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("DelimitedPayload",
      "encoder", "float",
      "delimiter", "*").create(stream);
  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    byte[] payData = payAttr.getPayload().bytes;
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 7
Source File: TokenizeTextBuilder.java    From kite with Apache License 2.0
@Override
protected boolean doProcess(Record record) {
  try {
    List outputValues = record.get(outputFieldName);
    for (Object value : record.get(inputFieldName)) {
      reader.setValue(value.toString());
      TokenStream tokenStream = analyzer.tokenStream("content", reader);
      tokenStream.reset();
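      // 'token' is presumably a CharTermAttribute field declared on this class
      // (not shown in this excerpt); each incrementToken() call refills it in place.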
      while (tokenStream.incrementToken()) {
        if (token.length() > 0) { // incrementToken() updates the token!
          String tokenStr = new String(token.buffer(), 0, token.length());
          outputValues.add(tokenStr);
        }
      }
      tokenStream.end();
      tokenStream.close();
    }
  } catch (IOException e) {
    throw new MorphlineRuntimeException(e);
  }
  
  // pass record to next command in chain:
  return super.doProcess(record);
}
 
Example 8
Source File: AnalyzerFactoryTestCase.java    From airsonic-advanced with GNU General Public License v3.0
private List<String> toTermString(String field, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzerFactory.getAnalyzer().tokenStream(field,
                new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString()
                    .replaceAll("^term\\=", ""));
        }
        stream.close();
    } catch (IOException e) {
        LoggerFactory.getLogger(AnalyzerFactoryTestCase.class)
                .error("Error during Token processing.", e);
    }
    return result;
}
 
Example 9
Source File: RevisedLesk.java    From lesk-wsd-dsm with GNU General Public License v3.0
/**
 * Builds a bag-of-words (term to frequency) representation of the given text.
 *
 * @param text the text to analyze
 * @return a map from each (optionally stemmed) term to its frequency
 * @throws IOException if tokenization fails
 */
public Map<String, Float> buildBag(String text) throws IOException {
    Map<String, Float> bag = new HashMap<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    SnowballStemmer stemmer = null;
    if (stemming) {
        stemmer = getStemmer(language);
        if (stemmer == null) {
            Logger.getLogger(RevisedLesk.class.getName()).log(Level.WARNING, "No stemmer for language {0}", language);
        }
    }
    TokenStream tokenStream = analyzer.tokenStream("gloss", new StringReader(text));
    // TermAttribute is the legacy (pre-Lucene 4) counterpart of CharTermAttribute
    TermAttribute token = tokenStream.getAttribute(TermAttribute.class);
    tokenStream.reset(); // required before the first incrementToken()
    while (tokenStream.incrementToken()) {
        String term = token.term();
        if (stemmer != null) {
            stemmer.setCurrent(term);
            if (stemmer.stem()) {
                term = stemmer.getCurrent();
            }
        }
        Float c = bag.get(term);
        if (c == null) {
            bag.put(term, 1f);
        } else {
            bag.put(term, c + 1f);
        }
    }
    tokenStream.end();
    tokenStream.close();
    return bag;
}
 
Example 10
Source File: LuceneUtil.java    From antsdb with GNU Lesser General Public License v3.0
static void tokenize(String text, BiConsumer<String, String> lambda) {
	try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
		TokenStream stream = analyzer.tokenStream("", text);
		CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
		TypeAttribute type = stream.getAttribute(TypeAttribute.class);
		stream.reset();
		while (stream.incrementToken()) {
			lambda.accept(type.type(), term.toString());
		}
	}
	catch (IOException x) {
		throw new RuntimeException(x);
	}
}
 
Example 11
Source File: QueryBuilder.java    From lucene-solr with Apache License 2.0
/** 
 * Creates simple boolean query from the cached tokenstream contents 
 */
protected Query analyzeBoolean(String field, TokenStream stream) throws IOException {
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  
  stream.reset();
  List<TermAndBoost> terms = new ArrayList<>();
  while (stream.incrementToken()) {
    terms.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
  }
  
  return newSynonymQuery(terms.toArray(new TermAndBoost[0]));
}
 
Example 12
Source File: SimpleNaiveBayesDocumentClassifier.java    From lucene-solr with Apache License 2.0
/**
 * Returns a token array from the {@link org.apache.lucene.analysis.TokenStream} in input
 *
 * @param tokenizedText the tokenized content of a field
 * @return a {@code String} array of the resulting tokens
 * @throws java.io.IOException If tokenization fails because there is a low-level I/O error
 */
protected String[] getTokenArray(TokenStream tokenizedText) throws IOException {
  Collection<String> tokens = new LinkedList<>();
  CharTermAttribute charTermAttribute = tokenizedText.addAttribute(CharTermAttribute.class);
  tokenizedText.reset();
  while (tokenizedText.incrementToken()) {
    tokens.add(charTermAttribute.toString());
  }
  tokenizedText.end();
  tokenizedText.close();
  return tokens.toArray(new String[0]);
}
 
Example 13
Source File: StringFieldType.java    From crate with Apache License 2.0
@Override
public Query multiPhraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException {

    MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
    mpqb.setSlop(slop);

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);

    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    int position = -1;

    List<Term> multiTerms = new ArrayList<>();
    stream.reset();
    while (stream.incrementToken()) {
        int positionIncrement = posIncrAtt.getPositionIncrement();

        if (positionIncrement > 0 && multiTerms.size() > 0) {
            if (enablePositionIncrements) {
                mpqb.add(multiTerms.toArray(new Term[0]), position);
            } else {
                mpqb.add(multiTerms.toArray(new Term[0]));
            }
            multiTerms.clear();
        }
        position += positionIncrement;
        multiTerms.add(new Term(field, termAtt.getBytesRef()));
    }

    if (enablePositionIncrements) {
        mpqb.add(multiTerms.toArray(new Term[0]), position);
    } else {
        mpqb.add(multiTerms.toArray(new Term[0]));
    }
    return mpqb.build();
}
 
Example 14
Source File: EdgeNGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
public void testEndPositionIncrement() throws IOException {
  TokenStream source = whitespaceMockTokenizer("seventeen one two three four");
  TokenStream input = new EdgeNGramTokenFilter(source, 8, 8, false);
  PositionIncrementAttribute posIncAtt = input.addAttribute(PositionIncrementAttribute.class);
  input.reset();
  while (input.incrementToken()) {}
  input.end();
  assertEquals(4, posIncAtt.getPositionIncrement());
}
 
Example 15
Source File: BaseformTokenFilterTests.java    From elasticsearch-analysis-baseform with Apache License 2.0
private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        assertTrue(i < expected.length);
        assertEquals(expected[i], termAttr.toString());
        i++;
    }
    assertEquals(expected.length, i); // JUnit convention: expected value first
    stream.close();
}
 
Example 16
Source File: LuceneAnalyzerTest.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void test1() throws Exception {
    MynlpAnalyzer analyzer = new MynlpAnalyzer(Lexers.core().filterReader(true, true));

    TokenStream tokenStream = analyzer.tokenStream("title", "商品和服务,上海市副市长,Git有很多优势,其中之一就是远程操作非常简便。本文详细介绍5个Git命令,它们的概念和用法,理解了这些内容,你就会完全掌握Git远程操作。");
    tokenStream.reset();

    StringBuffer sb = new StringBuffer();

    while (tokenStream.incrementToken()) {
        sb.append(tokenStream.getAttribute(CharTermAttribute.class));
        sb.append("\t");
        sb.append(tokenStream.getAttribute(OffsetAttribute.class).startOffset());
        sb.append("\t");
        sb.append(tokenStream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
        sb.append("\n");
    }

    analyzer.close();
    System.out.println(sb.toString());
//
//        Assert.assertTrue(sb.toString().equals(
//                "商品\t0\t1\n" +
//                        "服务\t3\t2\n" +
//                        "上海市\t6\t1\n" +
//                        "副市长\t9\t1\n" +
//                        "git\t13\t1\n" +
//                        "很多\t17\t2\n" +
//                        "优势\t19\t1\n" +
//                        "远程\t28\t4\n" +
//                        "操作\t30\t1\n" +
//                        "非常\t32\t1\n" +
//                        "简便\t34\t1\n" +
//                        "本文\t37\t1\n" +
//                        "详细\t39\t1\n" +
//                        "介绍\t41\t1\n" +
//                        "5个\t43\t1\n" +
//                        "git\t45\t1\n" +
//                        "命令\t48\t1\n" +
//                        "概念\t54\t3\n" +
//                        "用法\t57\t2\n" +
//                        "理解\t60\t1\n" +
//                        "内容\t65\t3\n" +
//                        "会\t70\t3\n" +
//                        "完全\t71\t1\n" +
//                        "掌握\t73\t1\n" +
//                        "git\t75\t1\n" +
//                        "远程\t78\t1\n" +
//                        "操作\t80\t1\n"));
}
 
Example 17
Source File: JsonPreAnalyzedParser.java    From lucene-solr with Apache License 2.0
@Override
public String toFormattedString(Field f) throws IOException {
  Map<String,Object> map = new LinkedHashMap<>();
  map.put(VERSION_KEY, VERSION);
  if (f.fieldType().stored()) {
    String stringValue = f.stringValue();
    if (stringValue != null) {
      map.put(STRING_KEY, stringValue);
    }
    BytesRef binaryValue = f.binaryValue();
    if (binaryValue != null) {
      map.put(BINARY_KEY, Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    List<Map<String,Object>> tokens = new LinkedList<>();
    while (ts.incrementToken()) {
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null;
      String tTerm = null;
      Map<String,Object> tok = new TreeMap<>();
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        Attribute att = ts.getAttribute(cl);
        if (att == null) {
          continue;
        }
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute)att;
          cTerm = new String(catt.buffer(), 0, catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute)att;
          tTerm = tatt.getBytesRef().utf8ToString();
        } else {
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute)att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.put(OFFSET_START_KEY, ((OffsetAttribute)att).startOffset());
            tok.put(OFFSET_END_KEY, ((OffsetAttribute)att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute)att).getPayload();
            if (p != null && p.length > 0) {
              tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.put(POSINCR_KEY, ((PositionIncrementAttribute)att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.put(TYPE_KEY, ((TypeAttribute)att).type());
          } else {
            tok.put(cl.getName(), att.toString());
          }
        }
      }
      String term = null;
      if (cTerm != null) {
        term = cTerm;
      } else {
        term = tTerm;
      }
      if (term != null && term.length() > 0) {
        tok.put(TOKEN_KEY, term);
      }
      tokens.add(tok);
    }
    map.put(TOKENS_KEY, tokens);
  }
  return JSONUtil.toJSON(map, -1);
}
 
Example 18
Source File: TermFilteredPresearcher.java    From lucene-solr with Apache License 2.0
@Override
public final Query buildQuery(LeafReader reader, BiPredicate<String, BytesRef> termAcceptor) {
  try {
    DocumentQueryBuilder queryBuilder = getQueryBuilder();
    for (FieldInfo field : reader.getFieldInfos()) {

      Terms terms = reader.terms(field.name);
      if (terms == null) {
        continue;
      }

      TokenStream ts = new TermsEnumTokenStream(terms.iterator());
      for (CustomQueryHandler handler : queryHandlers) {
        ts = handler.wrapTermStream(field.name, ts);
      }

      ts = new FilteringTokenFilter(ts) {
        TermToBytesRefAttribute termAtt = addAttribute(TermToBytesRefAttribute.class);
        @Override
        protected boolean accept() {
          return filterFields.contains(field.name) == false && termAcceptor.test(field.name, termAtt.getBytesRef());
        }
      };

      TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      while (ts.incrementToken()) {
        queryBuilder.addTerm(field.name, BytesRef.deepCopyOf(termAtt.getBytesRef()));
      }
      ts.close();

    }
    Query presearcherQuery = queryBuilder.build();

    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    bq.add(presearcherQuery, BooleanClause.Occur.SHOULD);
    bq.add(new TermQuery(new Term(ANYTOKEN_FIELD, ANYTOKEN)), BooleanClause.Occur.SHOULD);
    presearcherQuery = bq.build();
    if (filterFields.isEmpty() == false) {
      bq = new BooleanQuery.Builder();
      bq.add(presearcherQuery, BooleanClause.Occur.MUST);
      Query filterQuery = buildFilterFields(reader);
      if (filterQuery != null) {
        bq.add(filterQuery, BooleanClause.Occur.FILTER);
        presearcherQuery = bq.build();
      }
    }
    return presearcherQuery;
  } catch (IOException e) {
    // We're a MemoryIndex, so this shouldn't happen...
    throw new RuntimeException(e);
  }
}
 
Example 19
Source File: SimplePreAnalyzedParser.java    From lucene-solr with Apache License 2.0
@Override
public String toFormattedString(Field f) throws IOException {
  StringBuilder sb = new StringBuilder();
  sb.append(VERSION + " ");
  if (f.fieldType().stored()) {
    String s = f.stringValue();
    if (s != null) {
      // encode the equals sign
      s = s.replaceAll("=", "\\\\="); // "\\=" would be a no-op replacement; this emits a literal \=
      sb.append('=');
      sb.append(s);
      sb.append('=');
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    StringBuilder tok = new StringBuilder();
    boolean next = false;
    while (ts.incrementToken()) {
      if (next) {
        sb.append(' ');
      } else {
        next = true;
      }
      tok.setLength(0);
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null;
      String tTerm = null;
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        Attribute att = ts.getAttribute(cl);
        if (att == null) {
          continue;
        }
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute)att;
          cTerm = escape(catt.buffer(), catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute)att;
          char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
          tTerm = escape(tTermChars, tTermChars.length);
        } else {
          if (tok.length() > 0) tok.append(',');
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.append("f=").append(Integer.toHexString(((FlagsAttribute) att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.append("s=").append(((OffsetAttribute) att).startOffset()).append(",e=").append(((OffsetAttribute) att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute)att).getPayload();
            if (p != null && p.length > 0) {
              tok.append("p=").append(bytesToHex(p.bytes, p.offset, p.length));
            } else if (tok.length() > 0) {
              tok.setLength(tok.length() - 1); // remove the last comma
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.append("i=").append(((PositionIncrementAttribute) att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.append("y=").append(escape(((TypeAttribute) att).type()));
          } else {
            
            tok.append(cl.getName()).append('=').append(escape(att.toString()));
          }
        }
      }
      String term = null;
      if (cTerm != null) {
        term = cTerm;
      } else {
        term = tTerm;
      }
      if (term != null && term.length() > 0) {
        if (tok.length() > 0) {
          tok.insert(0, term + ",");
        } else {
          tok.insert(0, term);
        }
      }
      sb.append(tok);
    }
  }
  return sb.toString();
}
 
Example 20
Source File: TestLindenWordDelimiterAnalyzer.java    From linden with Apache License 2.0
@Test
public void testLindenWordDelimiterAnalyzer() throws Exception {
  LindenWordDelimiterAnalyzerFactory wordDelimiterAnalyzerFactory = new LindenWordDelimiterAnalyzerFactory();
  Map<String, String> args = new HashMap<>();
  Map<String, String> lastargs = new HashMap<>();
  args.put("luceneMatchVersion", "LUCENE_4_10_0");
  lastargs.putAll(args);
  Analyzer analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
  TokenStream stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. " +
                                                                 "你好,这是一个测试的实例。"
                                                                 + "created2018by sls sun-li-shun SunLiShun"));
  String expected =
      "[hello][test][case][你][好][这][是][一][个][测][试][的][实][例][created][2018][sls][sun][li][shun][sun][li][shun]";
  String out = "";
  stream.reset();
  while (stream.incrementToken()) {
    out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
  }
  Assert.assertEquals(expected, out);

  args.put("lower.case", "false");
  args.putAll(lastargs);
  lastargs.putAll(args);
  analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
  stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. " +
                                                     "你好,这是一个测试的实例。" + "created2018by sls on 20140707"));
  expected =
      "[Hello][test][case][你][好][这][是][一][个][测][试][的][实][例][created][2018][sls][20140707]";
  out = "";
  stream.reset();
  while (stream.incrementToken()) {
    out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
  }
  Assert.assertEquals(expected, out);

  args.put("set.stopwords", "false");
  args.putAll(lastargs);
  lastargs.putAll(args);
  analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
  stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. " +
                                                     "你好,这是一个测试的实例。" + "created2018by sls on 20140707"));
  expected =
      "[Hello][this][is][a][test][case][你][好][这][是][一][个][测][试][的][实][例][created][2018][by][sls][on][20140707]";
  out = "";
  stream.reset();
  while (stream.incrementToken()) {
    out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
  }
  Assert.assertEquals(expected, out);

  args.putAll(lastargs);
  args.put("splitOnCaseChange", "0");
  args.put("set.stopwords", "false");
  args.put("lower.case", "true");
  lastargs.putAll(args);
  analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
  stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. " +
                                                     "你好,这是一个测试的实例。" + "created2018by sls sun-li-shun SunLiShun"));
  expected =
      "[hello][this][is][a][test][case][你][好][这][是][一][个][测][试][的][实][例][created][2018][by][sls][sun][li][shun][sunlishun]";
  out = "";
  stream.reset();
  while (stream.incrementToken()) {
    out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
  }
  Assert.assertEquals(expected, out);
}