Java Code Examples for org.apache.lucene.util.CharsRefBuilder#copyUTF8Bytes()

The following examples show how to use org.apache.lucene.util.CharsRefBuilder#copyUTF8Bytes(). Each example is an excerpt from a real open-source project; the source file and license are noted above each listing.
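Before the project excerpts, here is a minimal self-contained sketch (not taken from any of the projects below) of what copyUTF8Bytes() does: it decodes a run of UTF-8 bytes into the builder's internal char buffer, replacing the builder's previous contents.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

public class CopyUTF8BytesDemo {
  public static void main(String[] args) {
    // A BytesRef built from a CharSequence holds the UTF-8 encoding,
    // the form in which Lucene stores terms.
    BytesRef utf8 = new BytesRef("hëllo wörld");

    // Decode the UTF-8 bytes back into chars; the builder is reusable.
    CharsRefBuilder chars = new CharsRefBuilder();
    chars.copyUTF8Bytes(utf8); // a (byte[] bytes, int offset, int length) overload also exists

    System.out.println(chars.toString()); // prints: hëllo wörld
  }
}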
Example 1
Source File: TestIndexWriterUnicode.java    From lucene-solr with Apache License 2.0
public void testRandomUnicodeStrings() throws Throwable {
  char[] buffer = new char[20];
  char[] expected = new char[20];

  CharsRefBuilder utf16 = new CharsRefBuilder();

  int num = atLeast(10000);
  for (int iter = 0; iter < num; iter++) {
    boolean hasIllegal = fillUnicode(buffer, expected, 0, 20);

    BytesRef utf8 = new BytesRef(CharBuffer.wrap(buffer, 0, 20));
    if (!hasIllegal) {
      byte[] b = new String(buffer, 0, 20).getBytes(StandardCharsets.UTF_8);
      assertEquals(b.length, utf8.length);
      for (int i = 0; i < b.length; i++)
        assertEquals(b[i], utf8.bytes[i]);
    }

    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals(utf16.length(), 20);
    for (int i = 0; i < 20; i++)
      assertEquals(expected[i], utf16.charAt(i));
  }
}
 
Example 2
Source File: QueryAutoStopWordAnalyzer.java    From lucene-solr with Apache License 2.0
/**
 * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
 * given selection of fields from terms with a document frequency greater than
 * the given maxDocFreq
 *
 * @param delegate Analyzer whose TokenStream will be filtered
 * @param indexReader IndexReader to identify the stopwords from
 * @param fields Selection of fields to calculate stopwords for
 * @param maxDocFreq Document frequency a term must exceed in order to be treated as a stopword
 * @throws IOException Can be thrown while reading from the IndexReader
 */
public QueryAutoStopWordAnalyzer(
    Analyzer delegate,
    IndexReader indexReader,
    Collection<String> fields,
    int maxDocFreq) throws IOException {
  super(delegate.getReuseStrategy());
  this.delegate = delegate;
  
  for (String field : fields) {
    Set<String> stopWords = new HashSet<>();
    Terms terms = MultiTerms.getTerms(indexReader, field);
    CharsRefBuilder spare = new CharsRefBuilder();
    if (terms != null) {
      TermsEnum te = terms.iterator();
      BytesRef text;
      while ((text = te.next()) != null) {
        if (te.docFreq() > maxDocFreq) {
          spare.copyUTF8Bytes(text);
          stopWords.add(spare.toString());
        }
      }
    }
    stopWordsPerField.put(field, stopWords);
  }
}
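The analyzer built above can then stand in for the original analyzer at query time, silently dropping the high-frequency terms it collected. A hedged construction sketch, assuming an open IndexReader named reader and a field named "body":

Analyzer stopWordAware = new QueryAutoStopWordAnalyzer(
    new StandardAnalyzer(), reader, Collections.singleton("body"), 1000);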
 
Example 3
Source File: TSTLookup.java    From lucene-solr with Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  root = new TernaryTreeNode();

  // make sure it's sorted and the comparator uses UTF16 sort order
  iterator = new SortedInputIterator(tempDir, tempFileNamePrefix, iterator, utf8SortedAsUTF16SortOrder);
  count = 0;
  ArrayList<String> tokens = new ArrayList<>();
  ArrayList<Number> vals = new ArrayList<>();
  BytesRef spare;
  CharsRefBuilder charsSpare = new CharsRefBuilder();
  while ((spare = iterator.next()) != null) {
    charsSpare.copyUTF8Bytes(spare);
    tokens.add(charsSpare.toString());
    vals.add(Long.valueOf(iterator.weight()));
    count++;
  }
  autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
}
 
Example 4
Source File: TermVectorsResponse.java    From Elasticsearch with Apache License 2.0
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
 
Example 5
Source File: FSTCompletionLookup.java    From lucene-solr with Apache License 2.0
@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean higherWeightsFirst, int num) {
  if (contexts != null) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  final List<Completion> completions;
  if (higherWeightsFirst) {
    completions = higherWeightsCompletion.lookup(key, num);
  } else {
    completions = normalCompletion.lookup(key, num);
  }
  
  final ArrayList<LookupResult> results = new ArrayList<>(completions.size());
  CharsRefBuilder spare = new CharsRefBuilder();
  for (Completion c : completions) {
    spare.copyUTF8Bytes(c.utf8);
    results.add(new LookupResult(spare.toString(), c.bucket));
  }
  return results;
}
 
Example 6
Source File: JaspellLookup.java    From lucene-solr with Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  count = 0;
  trie = new JaspellTernarySearchTrie();
  trie.setMatchAlmostDiff(editDistance);
  BytesRef spare;
  final CharsRefBuilder charsSpare = new CharsRefBuilder();

  while ((spare = iterator.next()) != null) {
    final long weight = iterator.weight();
    if (spare.length == 0) {
      continue;
    }
    charsSpare.copyUTF8Bytes(spare);
    trie.put(charsSpare.toString(), weight);
    count++;
  }
}
 
Example 7
Source File: MoreLikeThis.java    From lucene-solr with Apache License 2.0
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param field2termFreqMap a Map of terms and their frequencies per field
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName the field whose terms are being collected
 */
private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
  Map<String, Int> termFreqMap = field2termFreqMap.computeIfAbsent(fieldName, k -> new HashMap<>());
  final TermsEnum termsEnum = vector.iterator();
  final CharsRefBuilder spare = new CharsRefBuilder();
  BytesRef text;
  while((text = termsEnum.next()) != null) {
    spare.copyUTF8Bytes(text);
    final String term = spare.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    final int freq = (int) termsEnum.totalTermFreq();

    // increment frequency
    Int cnt = termFreqMap.get(term);
    if (cnt == null) {
      cnt = new Int();
      termFreqMap.put(term, cnt);
      cnt.x = freq;
    } else {
      cnt.x += freq;
    }
  }
}
 
Example 8
Source File: FieldType.java    From lucene-solr with Apache License 2.0
/**
 * Marshals a string-based field value.
 */
protected static Object marshalStringSortValue(Object value) {
  if (null == value) {
    return null;
  }
  CharsRefBuilder spare = new CharsRefBuilder();
  spare.copyUTF8Bytes((BytesRef)value);
  return spare.toString();
}
 
Example 9
Source File: TestIndexWriterUnicode.java    From lucene-solr with Apache License 2.0
public void testAllUnicodeChars() throws Throwable {

  CharsRefBuilder utf16 = new CharsRefBuilder();
  char[] chars = new char[2];
  for (int ch = 0; ch < 0x0010FFFF; ch++) {

    if (ch == 0xd800)
      // Skip the surrogate range (not valid standalone code points)
      ch = 0xe000;

    int len = 0;
    if (ch <= 0xffff) {
      chars[len++] = (char) ch;
    } else {
      chars[len++] = (char) (((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
      chars[len++] = (char) (((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
    }

    BytesRef utf8 = new BytesRef(CharBuffer.wrap(chars, 0, len));

    String s1 = new String(chars, 0, len);
    String s2 = new String(utf8.bytes, 0, utf8.length, StandardCharsets.UTF_8);
    assertEquals("codepoint " + ch, s1, s2);

    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals("codepoint " + ch, s1, utf16.toString());

    byte[] b = s1.getBytes(StandardCharsets.UTF_8);
    assertEquals(utf8.length, b.length);
    for (int j = 0; j < utf8.length; j++)
      assertEquals(utf8.bytes[j], b[j]);
  }
}
 
Example 10
Source File: XMoreLikeThis.java    From Elasticsearch with Apache License 2.0
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        if (isSkipTerm(fieldName, term)) {
            continue;
        }

        final PostingsEnum docs = termsEnum.postings(null);
        int freq = 0;
        while(docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            freq += docs.freq();
        }
        
        // increment frequency
        Int cnt = termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}
 
Example 11
Source File: PagedBytesReference.java    From Elasticsearch with Apache License 2.0
@Override
public String toUtf8() {
    if (length() == 0) {
        return "";
    }

    byte[] bytes = toBytes();
    final CharsRefBuilder ref = new CharsRefBuilder();
    ref.copyUTF8Bytes(bytes, offset, length);
    return ref.toString();
}
 
Example 12
Source File: SimpleTextLiveDocsFormat.java    From lucene-solr with Apache License 2.0
private int parseIntAt(BytesRef bytes, int offset, CharsRefBuilder scratch) {
  scratch.copyUTF8Bytes(bytes.bytes, bytes.offset + offset, bytes.length - offset);
  return ArrayUtil.parseInt(scratch.chars(), 0, scratch.length());
}
 
Example 13
Source File: AlfrescoLukeRequestHandler.java    From SearchServices with GNU Lesser General Public License v3.0
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req,
    String field, SimpleOrderedMap<Object> fieldMap) throws IOException {

  SolrParams params = req.getParams();
  final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

  TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to collect the top N terms in.

  final CharsRefBuilder spare = new CharsRefBuilder();

  Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(), field);
  if (terms == null) { // field does not exist
    return;
  }
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  int[] buckets = new int[HIST_ARRAY_SIZE];
  while ((text = termsEnum.next()) != null) {
    ++tiq.distinctTerms;
    int freq = termsEnum.docFreq(); // This calculation seems odd, but it gives the same results as it used to.
    int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
    buckets[slot] = buckets[slot] + 1;
    if (numTerms > 0 && freq > tiq.minFreq) {
      spare.copyUTF8Bytes(text);
      String t = spare.toString();

      tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
      if (tiq.size() > numTerms) { // if tiq full
        tiq.pop(); // remove lowest in tiq
        tiq.minFreq = tiq.getTopTermInfo().docFreq;
      }
    }
  }
  tiq.histogram.add(buckets);
  fieldMap.add("distinct", tiq.distinctTerms);

  // Include top terms
  fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

  // Add a histogram
  fieldMap.add("histogram", tiq.histogram.toNamedList());
}
 
Example 14
Source File: DirectSpellChecker.java    From lucene-solr with Apache License 2.0
/**
 * Provide spelling corrections based on several parameters.
 *
 * @param term The term to suggest spelling corrections for
 * @param numSug The maximum number of spelling corrections
 * @param ir The index reader to fetch the candidate spelling corrections from
 * @param docfreq The minimum document frequency a potential suggestion needs to have in order to be included
 * @param editDistance The maximum edit distance candidates are allowed to have
 * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included
 * @param spare a chars scratch
 * @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order.
 * @throws IOException If I/O related errors occur
 */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
                                               float accuracy, final CharsRefBuilder spare) throws IOException {

  Terms terms = MultiTerms.getTerms(ir, term.field());
  if (terms == null) {
    return Collections.emptyList();
  }
  FuzzyTermsEnum e = new FuzzyTermsEnum(terms, term, editDistance, Math.max(minPrefix, editDistance - 1), true);
  final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
  
  BytesRef queryTerm = new BytesRef(term.text());
  BytesRef candidateTerm;
  ScoreTerm st = new ScoreTerm();
  while ((candidateTerm = e.next()) != null) {
    // For FuzzyQuery, boost is the score:
    float score = e.getBoost();
    // ignore uncompetitive hits
    if (stQueue.size() >= numSug && score <= stQueue.peek().boost) {
      continue;
    }
    
    // ignore exact match of the same term
    if (queryTerm.bytesEquals(candidateTerm)) {
      continue;
    }
    
    int df = e.docFreq();
    
    // check docFreq if required
    if (df <= docfreq) {
      continue;
    }
    
    final String termAsString;
    if (distance == INTERNAL_LEVENSHTEIN) {
      // delay creating strings until the end
      termAsString = null;
    } else {
      spare.copyUTF8Bytes(candidateTerm);
      termAsString = spare.toString();
      score = distance.getDistance(term.text(), termAsString);
    }
    
    if (score < accuracy) {
      continue;
    }
    
    // add new entry in PQ
    st.term = BytesRef.deepCopyOf(candidateTerm);
    st.boost = score;
    st.docfreq = df;
    st.termAsString = termAsString;
    st.score = score;
    stQueue.offer(st);
    // possibly drop entries from queue
    st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
    e.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
  }
    
  return stQueue;
}
 
Example 15
Source File: PhraseSuggester.java    From Elasticsearch with Apache License 2.0
private PhraseSuggestion.Entry buildResultEntry(PhraseSuggestionContext suggestion, CharsRefBuilder spare, double cutoffScore) {
    spare.copyUTF8Bytes(suggestion.getText());
    return new PhraseSuggestion.Entry(new Text(spare.toString()), 0, spare.length(), cutoffScore);
}
 
Example 16
Source File: NoisyChannelSpellChecker.java    From Elasticsearch with Apache License 2.0
public TokenStream tokenStream(Analyzer analyzer, BytesRef query, CharsRefBuilder spare, String field) throws IOException {
    spare.copyUTF8Bytes(query);
    return analyzer.tokenStream(field, new FastCharArrayReader(spare.chars(), 0, spare.length()));
}
 
Example 17
Source File: LukeRequestHandler.java    From lucene-solr with Apache License 2.0
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap)
    throws IOException {

  SolrParams params = req.getParams();
  final int numTerms = params.getInt( NUMTERMS, DEFAULT_COUNT );

  TopTermQueue tiq = new TopTermQueue(numTerms + 1);  // Something to collect the top N terms in.

  final CharsRefBuilder spare = new CharsRefBuilder();

  Terms terms = MultiTerms.getTerms(req.getSearcher().getIndexReader(), field);
  if (terms == null) {  // field does not exist
    return;
  }
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  int[] buckets = new int[HIST_ARRAY_SIZE];
  while ((text = termsEnum.next()) != null) {
    ++tiq.distinctTerms;
    int freq = termsEnum.docFreq();  // This calculation seems odd, but it gives the same results as it used to.
    int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
    buckets[slot] = buckets[slot] + 1;
    if (numTerms > 0 && freq > tiq.minFreq) {
      spare.copyUTF8Bytes(text);
      String t = spare.toString();

      tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
      if (tiq.size() > numTerms) { // if tiq full
        tiq.pop(); // remove lowest in tiq
        tiq.minFreq = tiq.getTopTermInfo().docFreq;
      }
    }
  }
  tiq.histogram.add(buckets);
  fieldMap.add("distinct", tiq.distinctTerms);

  // Include top terms
  fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

  // Add a histogram
  fieldMap.add("histogram", tiq.histogram.toNamedList());
}
 
Example 18
Source File: FieldType.java    From lucene-solr with Apache License 2.0
/** Given an indexed term, append the human-readable representation. */
public CharsRef indexedToReadable(BytesRef input, CharsRefBuilder output) {
  output.copyUTF8Bytes(input);
  return output.get();
}
 
Example 19
Source File: SuggestUtils.java    From Elasticsearch with Apache License 2.0
public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) throws IOException {
    spare.copyUTF8Bytes(toAnalyze);
    return analyze(analyzer, spare.get(), field, consumer);
}