Java Code Examples for org.apache.lucene.analysis.core.WhitespaceTokenizer#reset()
The following examples show how to use
org.apache.lucene.analysis.core.WhitespaceTokenizer#reset() .
These examples are extracted from open source projects.
You can vote up the examples you like or vote down the ones you don't like.
To visit the original project or source file, follow the links above each example. Related API usage can be found on the sidebar.
Example 1
Source Project: lucene-solr File: XmlInterpolationTest.java License: Apache License 2.0 | 6 votes |
/**
 * Strips HTML from {@code docText} (leaving {@code <unescaped>} elements intact),
 * tokenizes the remainder on whitespace, and returns the token strings in order.
 */
private String[] analyzeReturnTokens(String docText) {
  List<String> tokens = new ArrayList<>();
  Reader stripped =
      new HTMLStripCharFilter(new StringReader(docText), Collections.singleton("unescaped"));
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  final CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  try {
    tokenizer.setReader(stripped);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      tokens.add(term.toString());
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    // closeQuietly: best-effort cleanup regardless of how the try block exited
    IOUtils.closeQuietly(tokenizer);
  }
  return tokens.toArray(new String[tokens.size()]);
}
Example 2
Source Project: gerbil File: DatasetAnalyzer.java License: GNU Affero General Public License v3.0 | 6 votes |
/**
 * Counts the whitespace-separated tokens in {@code text}.
 *
 * <p>Tokenization failures are treated as best-effort: the error is logged and the
 * count accumulated so far is returned rather than propagating the exception.
 *
 * @param text the text to tokenize; must not be null
 * @return the number of tokens counted before completion or failure
 */
private int countTokensInText(String text) {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    int tokens = 0;
    try {
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            ++tokens;
        }
        // end() completes the TokenStream consumption contract
        // (reset -> incrementToken* -> end -> close); it was missing before.
        tokenizer.end();
    } catch (Exception e) {
        // Deliberately swallow: return the partial count instead of failing the analysis.
        LOGGER.error("Error while tokenizing text. Returning.", e);
    } finally {
        IOUtils.closeQuietly(tokenizer);
    }
    return tokens;
}
Example 3
Source Project: SolrTextTagger File: XmlInterpolationTest.java License: Apache License 2.0 | 6 votes |
/**
 * Strips HTML from {@code docText} (leaving {@code <unescaped>} elements intact),
 * tokenizes the remainder on whitespace, and returns the token strings in order.
 */
private String[] analyzeReturnTokens(String docText) {
  List<String> tokens = new ArrayList<>();
  Reader stripped =
      new HTMLStripCharFilter(new StringReader(docText), Collections.singleton("unescaped"));
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  final CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  try {
    tokenizer.setReader(stripped);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      tokens.add(term.toString());
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    // closeQuietly: best-effort cleanup regardless of how the try block exited
    IOUtils.closeQuietly(tokenizer);
  }
  return tokens.toArray(new String[tokens.size()]);
}
Example 4
Source Project: lucene-solr File: XmlInterpolationTest.java License: Apache License 2.0 | 5 votes |
/**
 * Scans the HTML-stripped, whitespace-tokenized form of {@code docText} for the
 * {@code start} and {@code end} tokens.
 *
 * @return a two-element array: [0] = start offset of the {@code start} token,
 *     [1] = end offset of the {@code end} token; either slot is -1 if not found.
 *     Returns immediately once the {@code end} token is seen.
 */
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] offsets = {-1, -1};
  Reader stripped = new HTMLStripCharFilter(new StringReader(docText));
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  final CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
  try {
    tokenizer.setReader(stripped);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      String token = term.toString();
      if (token.equals(start)) {
        offsets[0] = offset.startOffset();
      }
      if (token.equals(end)) {
        offsets[1] = offset.endOffset();
        return offsets; // early exit; the finally block still closes the tokenizer
      }
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(tokenizer);
  }
  return offsets;
}
Example 5
Source Project: SolrTextTagger File: XmlInterpolationTest.java License: Apache License 2.0 | 5 votes |
/**
 * Scans the HTML-stripped, whitespace-tokenized form of {@code docText} for the
 * {@code start} and {@code end} tokens.
 *
 * @return a two-element array: [0] = start offset of the {@code start} token,
 *     [1] = end offset of the {@code end} token; either slot is -1 if not found.
 *     Returns immediately once the {@code end} token is seen.
 */
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] offsets = {-1, -1};
  Reader stripped = new HTMLStripCharFilter(new StringReader(docText));
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  final CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
  try {
    tokenizer.setReader(stripped);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      String token = term.toString();
      if (token.equals(start)) {
        offsets[0] = offset.startOffset();
      }
      if (token.equals(end)) {
        offsets[1] = offset.endOffset();
        return offsets; // early exit; the finally block still closes the tokenizer
      }
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(tokenizer);
  }
  return offsets;
}
Example 6
Source Project: solr-researcher File: ReSearcherUtils.java License: Apache License 2.0 | 4 votes |
/** * Separates tokens from query. Treats each quote as a separate token, since that makes it easier to examine the query. * * @param queryString . * @param tokens . * @return number of quotes in the query */ public static int tokenizeQueryString(String queryString, List<String> tokens) { int countOfQuotes = 0; try { // first tokenize words and treat each quote as a separate token Map<String,String> args = new HashMap<String, String>(); args.put(WhitespaceTokenizerFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_6_3_0.toString()); WhitespaceTokenizerFactory f = new WhitespaceTokenizerFactory(args); WhitespaceTokenizer s = (WhitespaceTokenizer)f.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); s.setReader(new StringReader(queryString)); s.reset(); while (true) { CharTermAttribute t = s.getAttribute(CharTermAttribute.class); if (t == null) { break; } String tokentText = new String(t.toString()); if (tokentText.equals("\"")) { tokens.add("\""); countOfQuotes++; } else if (tokentText.startsWith("\"")) { tokens.add("\""); countOfQuotes++; if (tokentText.endsWith("\"")) { tokens.add(tokentText.substring(1, tokentText.length() - 1)); tokens.add("\""); countOfQuotes++; } else { tokens.add(tokentText.substring(1)); } } else if (tokentText.endsWith("\"")) { tokens.add(tokentText.substring(0, tokentText.length() - 1)); tokens.add("\""); countOfQuotes++; } else if (!tokentText.trim().equals("")) { // take into account only if different than empty string tokens.add(tokentText); } if (!s.incrementToken()) { break; } } s.end(); s.close(); } catch (IOException e) { throw new RuntimeException(e); } return countOfQuotes; }