Java Code Examples for org.apache.lucene.analysis.tokenattributes.CharTermAttribute#toString()

The following examples show how to use org.apache.lucene.analysis.tokenattributes.CharTermAttribute#toString(). Each example is taken from an open-source project; the source file and license are noted above each listing.
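All of the examples share one consumption pattern: obtain a TokenStream from an Analyzer, register a CharTermAttribute before reset(), iterate with incrementToken() while reading each term via toString(), then call end() and close(). A minimal, self-contained sketch of that pattern (StandardAnalyzer and the field name "field" are chosen purely for illustration):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CharTermAttributeDemo {
    public static void main(String[] args) throws IOException {
        List<String> terms = new ArrayList<>();
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("field", "Hello Lucene token streams")) {
            // The attribute instance is registered once and updated in place on
            // every incrementToken() call; toString() copies out the current term.
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                        // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                terms.add(termAtt.toString());
            }
            ts.end();                          // records final offset state
        }                                      // try-with-resources closes the stream
        System.out.println(terms);             // e.g. [hello, lucene, token, streams]
    }
}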
Example 1
Source File: Tokenizers.java    From ache with Apache License 2.0
public List<String> tokenize(String cleanText) {
    // try-with-resources guarantees the stream is closed even if tokenization fails
    try (TokenStream ts = analyzer.tokenStream("cleanText", cleanText)) {
        CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        List<String> tokens = new ArrayList<String>();
        while (ts.incrementToken()) {
            String token = cattr.toString();
            tokens.add(token);
        }
        ts.end();
        return tokens;
    } catch (IOException e) {
        throw new RuntimeException(
                "Shingle tokenization failed for string: " + cleanText, e);
    }
}
 
Example 2
Source File: MinHash.java    From minhash with Apache License 2.0
/**
 * Calculates the MinHash value for the given text.
 *
 * @param analyzer analyzer used to tokenize the text
 * @param text a target text
 * @return MinHash value, or null if the text yields no tokens
 * @throws IOException if the token stream cannot be read
 */
public static byte[] calculate(final Analyzer analyzer, final String text)
        throws IOException {
    byte[] value = null;
    try (TokenStream stream = analyzer.tokenStream("minhash", text)) {
        final CharTermAttribute termAtt = stream
                .addAttribute(CharTermAttribute.class);
        stream.reset();
        if (stream.incrementToken()) {
            final String minhashValue = termAtt.toString();
            value = BaseEncoding.base64().decode(minhashValue);
        }
        stream.end();
    }
    return value;
}
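This works because the method assumes the supplied Analyzer ends in a MinHash-style filter, so the stream emits (at most) one token whose text is a base64-encoded hash; termAtt.toString() recovers that string for BaseEncoding.base64().decode(). If the stream yields no token at all, the method returns null.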
 
Example 3
Source File: TestStopAnalyzer.java    From lucene-solr with Apache License 2.0
public void testStopList() throws IOException {
  CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
  StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
  try (TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
    assertNotNull(stream);
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
  
    stream.reset();
    while (stream.incrementToken()) {
      String text = termAtt.toString();
      assertFalse(stopWordsSet.contains(text));
    }
    stream.end();
  }
  newStop.close();
}
 
Example 4
Source File: ShingleAnalyzerWrapperTest.java    From lucene-solr with Apache License 2.0
public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
  BooleanQuery.Builder q = new BooleanQuery.Builder();

  try (TokenStream ts = analyzer.tokenStream("content", "test sentence")) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  
    ts.reset();
    while (ts.incrementToken()) {
      String termText = termAtt.toString();
      q.add(new TermQuery(new Term("content", termText)),
          BooleanClause.Occur.SHOULD);
    }
    ts.end();
  }

  ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs;
  int[] ranks = new int[] { 1, 2, 0 };
  compareRanks(hits, ranks);
}
 
Example 5
Source File: BooleanPerceptronClassifier.java    From lucene-solr with Apache License 2.0
@Override
public ClassificationResult<Boolean> assignClass(String text)
        throws IOException {
  Long output = 0L;
  try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
    CharTermAttribute charTermAttribute = tokenStream
            .addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      String s = charTermAttribute.toString();
      Long d = Util.get(fst, new BytesRef(s));
      if (d != null) {
        output += d;
      }
    }
    tokenStream.end();
  }

  double score = 1 - Math.exp(-1 * Math.abs(bias - output.doubleValue()) / bias);
  return new ClassificationResult<>(output >= bias, score);
}
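The confidence returned alongside the Boolean class maps the margin between the accumulated sum and the bias onto (0, 1): score = 1 - exp(-|bias - output| / bias), so sums far from the decision threshold produce scores near 1.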
 
Example 6
Source File: FeatureExtractorUtilities.java    From samantha with MIT License
static public Map<String, Integer> getTermFreq(Analyzer analyzer, String text, String termField) {
    TokenStream ts = analyzer.tokenStream(termField, text);
    CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
    Map<String, Integer> termFreq = new HashMap<>();
    try {
        ts.reset();
        while (ts.incrementToken()) {
            String term = cattr.toString();
            // look up and update with the same composed key so counts accumulate
            String key = FeatureExtractorUtilities.composeKey(termField, term);
            int cnt = termFreq.getOrDefault(key, 0);
            termFreq.put(key, cnt + 1);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("{}", e.getMessage());
        throw new BadRequestException(e);
    }
    return termFreq;
}
 
Example 7
Source File: LuceneUtil.java    From jasperreports with GNU Lesser General Public License v3.0
protected String displayTokens(String text, String elementId) throws IOException {
	Analyzer analyzer = new LuceneSimpleAnalyzer(isCaseSensitive, removeAccents);
	StringBuilder sb = new StringBuilder();
	sb.append(elementId).append(": ").append(text).append(": ");

	// close the stream deterministically via try-with-resources
	try (TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text))) {
		CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
		OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);

		tokenStream.reset();
		while (tokenStream.incrementToken()) {
			int startOffset = offsetAttribute.startOffset();
			int endOffset = offsetAttribute.endOffset();
			String term = charTermAttribute.toString();
			sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
		}
		tokenStream.end();
	}

	return sb.toString();
}
 
Example 8
Source File: AbstractSearchTest.java    From database with GNU General Public License v2.0
private void compareTokenStream(Analyzer a, String text, String[] expected) throws IOException {
	TokenStream s = a.tokenStream(null, new StringReader(text));
	// fetch the attribute once; it is updated in place on each incrementToken()
	final CharTermAttribute term = s.getAttribute(CharTermAttribute.class);
	int ix = 0;

	s.reset();

	while (s.incrementToken()) {
		final String word = term.toString();
		assertTrue(ix < expected.length);
		assertEquals(expected[ix++], word);
	}
	s.end();
	s.close();
	assertEquals(expected.length, ix);
}
 
Example 9
Source File: TextParseUtils.java    From SimpleTextSearch with MIT License
public List<String> tokenize(String rawText) {
    List<String> retVal = new ArrayList<>();
    if (StringUtils.isEmpty(rawText)) {
        return retVal;
    }

    try (TokenStream ts = analyzer.tokenStream(null, rawText)) {
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // keep only ASCII letters and spaces
            String str = term.toString().replaceAll("[^a-zA-Z ]", "");
            if (str.isEmpty()) {
                continue;
            }
            retVal.add(str);
        }
        ts.end();
    } catch (IOException ex) {
        // tokenizing an in-memory string should not fail; fall through with
        // whatever tokens were collected so far
    }

    return retVal;
}
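Note that the [^a-zA-Z ] replacement strips every non-ASCII character, so tokens containing no Latin letters disappear entirely; this tokenizer is effectively limited to English-like text.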
 
Example 10
Source File: XmlInterpolationTest.java    From SolrTextTagger with Apache License 2.0
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] result = {-1, -1};

  Reader filter = new HTMLStripCharFilter(new StringReader(docText));

  WhitespaceTokenizer ts = new WhitespaceTokenizer();
  final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
  try {
    ts.setReader(filter);
    ts.reset();
    while (ts.incrementToken()) {
      final String termString = termAttribute.toString();
      if (termString.equals(start))
        result[0] = offsetAttribute.startOffset();
      if (termString.equals(end)) {
        result[1] = offsetAttribute.endOffset();
        return result;
      }
    }
    ts.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ts);
  }
  return result;
}
 
Example 11
Source File: ChineseMatcher.java    From zxl with Apache License 2.0
public double oneWayMatch(String text1, String text2) {
	try {
		Set<String> set = new HashSet<String>(10);
		TokenStream tokenStream = smartChineseAnalyzer.tokenStream("field", text1);
		CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
		tokenStream.reset();
		while (tokenStream.incrementToken()) {
			set.add(charTermAttribute.toString());
		}
		int originalCount = set.size();
		tokenStream.end();
		tokenStream.close();

		tokenStream = smartChineseAnalyzer.tokenStream("field", text2);
		charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
		tokenStream.reset();
		int smallWeightWordsCount = 0;
		int denominator = 0;
		while (tokenStream.incrementToken()) {
			denominator++;
			String word = charTermAttribute.toString();
			int tempSize = set.size();
			set.add(word);
			if (tempSize + 1 == set.size() && smallWeightWords.contains(word)) {
				smallWeightWordsCount++;
			}
		}
		int numerator = set.size() - originalCount;
		tokenStream.end();
		tokenStream.close();
		if (denominator == 0) {
			// text2 produced no tokens; treat as a complete mismatch
			return 1D;
		}
		double unmatchRate = (smallWeightWordsCount * smallWeight + numerator - ((double) smallWeightWordsCount)) / denominator;
		return unmatchRate;
	} catch (IOException e) {
		return 1D;
	}
}
 
Example 12
Source File: KuromojiUDF.java    From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull final TokenStream stream,
        @Nonnull final List<Text> tokenResult, @Nonnull final List<Text> posResult)
        throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    PartOfSpeechAttribute posAttr = stream.addAttribute(PartOfSpeechAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        tokenResult.add(new Text(term));
        String pos = posAttr.getPartOfSpeech();
        posResult.add(new Text(pos));
    }
}
 
Example 13
Source File: KuromojiUDF.java    From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull final TokenStream stream,
        @Nonnull final List<Text> tokens) throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        tokens.add(new Text(term));
    }
}
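Neither Kuromoji helper ends or closes the stream it is handed; the caller owns the stream's lifecycle. A hypothetical driver for the two-argument variant (JapaneseAnalyzer, the Hadoop Text list, and the field name "text" are illustrative assumptions, not part of the UDF):

// Hypothetical usage sketch; assumes lucene-analyzers-kuromoji is on the classpath.
List<Text> tokens = new ArrayList<>();
try (Analyzer analyzer = new JapaneseAnalyzer();
     TokenStream stream = analyzer.tokenStream("text", "関西国際空港で働く")) {
    analyzeTokens(stream, tokens); // registers the attribute, reset()s, and drains the stream
    stream.end();                  // the helper leaves end()/close() to the caller
}
System.out.println(tokens);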
 
Example 14
Source File: SmartcnUDF.java    From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
        throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        results.add(new Text(term));
    }
}
 
Example 15
Source File: MoreLikeThis.java    From lucene-solr with Apache License 2.0
/**
 * Adds term frequencies found by tokenizing text from the reader into the given per-field frequency map.
 *
 * @param r a source of text to be tokenized
 * @param perFieldTermFrequencies a Map of terms and their frequencies per field
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Map<String, Int>> perFieldTermFrequencies, String fieldName)
    throws IOException {
  if (analyzer == null) {
    throw new UnsupportedOperationException("To use MoreLikeThis without " +
        "term vectors, you must provide an Analyzer");
  }
  Map<String, Int> termFreqMap = perFieldTermFrequencies.computeIfAbsent(fieldName, k -> new HashMap<>());
  try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
    int tokenCount = 0;
    // for every token
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TermFrequencyAttribute tfAtt = ts.addAttribute(TermFrequencyAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      String word = termAtt.toString();
      tokenCount++;
      if (tokenCount > maxNumTokensParsed) {
        break;
      }
      if (isNoiseWord(word)) {
        continue;
      }

      // increment frequency
      Int cnt = termFreqMap.get(word);
      if (cnt == null) {
        termFreqMap.put(word, new Int(tfAtt.getTermFrequency()));
      } else {
        cnt.x += tfAtt.getTermFrequency();
      }
    }
    ts.end();
  }
}
 
Example 16
Source File: XMoreLikeThis.java    From Elasticsearch with Apache License 2.0
/**
 * Adds term frequencies found by tokenizing text from the reader into the given frequency map.
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
        throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException("To use MoreLikeThis without " +
                "term vectors, you must provide an Analyzer");
    }
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            if (isSkipTerm(fieldName, word)) {
                continue;
            }

            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    }
}
 
Example 17
Source File: ReSearcherUtils.java    From solr-researcher with Apache License 2.0
/**
 * Separates tokens from the query. Treats each quote as a separate token, since that makes it easier to examine the query.
 *
 * @param queryString the raw query string to tokenize
 * @param tokens output list that receives the extracted tokens
 * @return number of quotes in the query
 */
public static int tokenizeQueryString(String queryString, List<String> tokens) {
  int countOfQuotes = 0;

  try {
    // first tokenize words and treat each quote as a separate token
    Map<String,String> args = new HashMap<String, String>();
    args.put(WhitespaceTokenizerFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_6_3_0.toString());
    WhitespaceTokenizerFactory f = new WhitespaceTokenizerFactory(args);

    WhitespaceTokenizer s = (WhitespaceTokenizer) f.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    s.setReader(new StringReader(queryString));
    s.reset();

    // fetch the attribute once; it is updated in place on each incrementToken()
    CharTermAttribute t = s.getAttribute(CharTermAttribute.class);

    while (s.incrementToken()) {
      String tokenText = t.toString();

      if (tokenText.equals("\"")) {
        tokens.add("\"");
        countOfQuotes++;
      } else if (tokenText.startsWith("\"")) {
        tokens.add("\"");
        countOfQuotes++;

        if (tokenText.endsWith("\"")) {
          tokens.add(tokenText.substring(1, tokenText.length() - 1));
          tokens.add("\"");
          countOfQuotes++;
        } else {
          tokens.add(tokenText.substring(1));
        }
      } else if (tokenText.endsWith("\"")) {
        tokens.add(tokenText.substring(0, tokenText.length() - 1));
        tokens.add("\"");
        countOfQuotes++;
      } else if (!tokenText.trim().equals("")) {
        // take into account only if different than empty string
        tokens.add(tokenText);
      }
    }
    s.end();
    s.close();
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return countOfQuotes;
}
 
Example 18
Source File: Zemberek2DeASCIIfyFilterFactory.java    From lucene-solr-analysis-turkish with Apache License 2.0
public static void main(String[] args) throws IOException {
    StringReader reader = new StringReader("kus asisi ortaklar çekişme masali");

    Map<String, String> map = new HashMap<>();

    Zemberek2DeASCIIfyFilterFactory factory = new Zemberek2DeASCIIfyFilterFactory(map);
    WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    whitespaceTokenizer.setReader(reader);

    TokenStream stream = factory.create(whitespaceTokenizer);

    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttribute.toString();
        System.out.println(term);
    }
    stream.end();
    stream.close(); // closing the stream also closes the underlying reader
}
 
Example 19
Source File: Zemberek2StemFilterFactory.java    From lucene-solr-analysis-turkish with Apache License 2.0
public static void main(String[] args) throws IOException {
    StringReader reader = new StringReader("elması utansın ortaklar çekişme ile");

    Map<String, String> map = new HashMap<>();
    map.put("strategy", "frequency");

    Zemberek2StemFilterFactory factory = new Zemberek2StemFilterFactory(map);

    WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    whitespaceTokenizer.setReader(reader);

    TokenStream stream = factory.create(whitespaceTokenizer);

    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttribute.toString();
        System.out.println(term);
    }
    stream.end();
    stream.close(); // closing the stream also closes the underlying reader
}