org.apache.lucene.util.CharsRefBuilder#toString

Source File: MoreLikeThis.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param field2termFreqMap a Map of terms and their frequencies per field
 * @param vector List of terms and their frequencies for a doc/field
 */
private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
  Map<String, Int> termFreqMap = field2termFreqMap.computeIfAbsent(fieldName, k -> new HashMap<>());
  final TermsEnum termsEnum = vector.iterator();
  final CharsRefBuilder spare = new CharsRefBuilder();
  BytesRef text;
  while((text = termsEnum.next()) != null) {
    spare.copyUTF8Bytes(text);
    final String term = spare.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    final int freq = (int) termsEnum.totalTermFreq();

    // increment frequency
    Int cnt = termFreqMap.get(term);
    if (cnt == null) {
      cnt = new Int();
      termFreqMap.put(term, cnt);
      cnt.x = freq;
    } else {
      cnt.x += freq;
    }
  }
}

Source File: PagedBytesReference.java From Elasticsearch with Apache License 2.0

5 votes

@Override
public String toUtf8() {
    if (length() == 0) {
        return "";
    }

    byte[] bytes = toBytes();
    final CharsRefBuilder ref = new CharsRefBuilder();
    ref.copyUTF8Bytes(bytes, offset, length);
    return ref.toString();
}

Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        if (isSkipTerm(fieldName, term)) {
            continue;
        }

        final PostingsEnum docs = termsEnum.postings(null);
        int freq = 0;
        while(docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            freq += docs.freq();
        }
        
        // increment frequency
        Int cnt = termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}

Source File: DeleteUpdateCommand.java From lucene-solr with Apache License 2.0

5 votes

public String getId() {
  if (id == null && indexedId != null) {
    IndexSchema schema = req.getSchema();
    SchemaField sf = schema.getUniqueKeyField();
    if (sf != null) {
      CharsRefBuilder ref = new CharsRefBuilder();
      sf.getType().indexedToReadable(indexedId, ref);
      id = ref.toString();
    }
  }
  return id;
}

Source File: ExpandComponent.java From lucene-solr with Apache License 2.0

5 votes

@SuppressWarnings({"unchecked"})
private void addGroupSliceToOutputMap(FieldType fieldType, IntObjectHashMap<BytesRef> ordBytes,
                                      @SuppressWarnings({"rawtypes"})NamedList outMap, CharsRefBuilder charsRef, long groupValue, DocSlice slice) {
  if(fieldType instanceof StrField) {
    final BytesRef bytesRef = ordBytes.get((int)groupValue);
    fieldType.indexedToReadable(bytesRef, charsRef);
    String group = charsRef.toString();
    outMap.add(group, slice);
  } else {
    outMap.add(numericToString(fieldType, groupValue), slice);
  }
}

Source File: FieldType.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Marshals a string-based field value.
 */
protected static Object marshalStringSortValue(Object value) {
  if (null == value) {
    return null;
  }
  CharsRefBuilder spare = new CharsRefBuilder();
  spare.copyUTF8Bytes((BytesRef)value);
  return spare.toString();
}

Source File: PhraseSuggester.java From Elasticsearch with Apache License 2.0

4 votes

private PhraseSuggestion.Entry buildResultEntry(PhraseSuggestionContext suggestion, CharsRefBuilder spare, double cutoffScore) {
    spare.copyUTF8Bytes(suggestion.getText());
    return new PhraseSuggestion.Entry(new Text(spare.toString()), 0, spare.length(), cutoffScore);
}

Source File: AlfrescoLukeRequestHandler.java From SearchServices with GNU Lesser General Public License v3.0

4 votes

@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req,
		String field, SimpleOrderedMap<Object> fieldMap) throws IOException {

	SolrParams params = req.getParams();
	final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

	TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to
														// collect the top N
														// terms in.

	final CharsRefBuilder spare = new CharsRefBuilder();

	Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(),
			field);
	if (terms == null) { // field does not exist
		return;
	}
	TermsEnum termsEnum = terms.iterator();
	BytesRef text;
	int[] buckets = new int[HIST_ARRAY_SIZE];
	while ((text = termsEnum.next()) != null) {
		++tiq.distinctTerms;
		int freq = termsEnum.docFreq(); // This calculation seems odd, but
										// it gives the same results as it
										// used to.
		int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
		buckets[slot] = buckets[slot] + 1;
		if (numTerms > 0 && freq > tiq.minFreq) {
			spare.copyUTF8Bytes(text);
			String t = spare.toString();

			tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum
					.docFreq()));
			if (tiq.size() > numTerms) { // if tiq full
				tiq.pop(); // remove lowest in tiq
				tiq.minFreq = tiq.getTopTermInfo().docFreq;
			}
		}
	}
	tiq.histogram.add(buckets);
	fieldMap.add("distinct", tiq.distinctTerms);

	// Include top terms
	fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

	// Add a histogram
	fieldMap.add("histogram", tiq.histogram.toNamedList());
}

Source File: DirectSpellChecker.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Provide spelling corrections based on several parameters.
 *
 * @param term The term to suggest spelling corrections for
 * @param numSug The maximum number of spelling corrections
 * @param ir The index reader to fetch the candidate spelling corrections from
 * @param docfreq The minimum document frequency a potential suggestion need to have in order to be included
 * @param editDistance The maximum edit distance candidates are allowed to have
 * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included
 * @param spare a chars scratch
 * @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order.
 * @throws IOException If I/O related errors occur
 */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
                                               float accuracy, final CharsRefBuilder spare) throws IOException {

  Terms terms = MultiTerms.getTerms(ir, term.field());
  if (terms == null) {
    return Collections.emptyList();
  }
  FuzzyTermsEnum e = new FuzzyTermsEnum(terms, term, editDistance, Math.max(minPrefix, editDistance - 1), true);
  final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
  
  BytesRef queryTerm = new BytesRef(term.text());
  BytesRef candidateTerm;
  ScoreTerm st = new ScoreTerm();
  while ((candidateTerm = e.next()) != null) {
    // For FuzzyQuery, boost is the score:
    float score = e.getBoost();
    // ignore uncompetitive hits
    if (stQueue.size() >= numSug && score <= stQueue.peek().boost) {
      continue;
    }
    
    // ignore exact match of the same term
    if (queryTerm.bytesEquals(candidateTerm)) {
      continue;
    }
    
    int df = e.docFreq();
    
    // check docFreq if required
    if (df <= docfreq) {
      continue;
    }
    
    final String termAsString;
    if (distance == INTERNAL_LEVENSHTEIN) {
      // delay creating strings until the end
      termAsString = null;
    } else {
      spare.copyUTF8Bytes(candidateTerm);
      termAsString = spare.toString();
      score = distance.getDistance(term.text(), termAsString);
    }
    
    if (score < accuracy) {
      continue;
    }
    
    // add new entry in PQ
    st.term = BytesRef.deepCopyOf(candidateTerm);
    st.boost = score;
    st.docfreq = df;
    st.termAsString = termAsString;
    st.score = score;
    stQueue.offer(st);
    // possibly drop entries from queue
    st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
    e.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
  }
    
  return stQueue;
}

Source File: LukeRequestHandler.java From lucene-solr with Apache License 2.0

4 votes

@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap)
    throws IOException {

  SolrParams params = req.getParams();
  final int numTerms = params.getInt( NUMTERMS, DEFAULT_COUNT );

  TopTermQueue tiq = new TopTermQueue(numTerms + 1);  // Something to collect the top N terms in.

  final CharsRefBuilder spare = new CharsRefBuilder();

  Terms terms = MultiTerms.getTerms(req.getSearcher().getIndexReader(), field);
  if (terms == null) {  // field does not exist
    return;
  }
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  int[] buckets = new int[HIST_ARRAY_SIZE];
  while ((text = termsEnum.next()) != null) {
    ++tiq.distinctTerms;
    int freq = termsEnum.docFreq();  // This calculation seems odd, but it gives the same results as it used to.
    int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
    buckets[slot] = buckets[slot] + 1;
    if (numTerms > 0 && freq > tiq.minFreq) {
      spare.copyUTF8Bytes(text);
      String t = spare.toString();

      tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
      if (tiq.size() > numTerms) { // if tiq full
        tiq.pop(); // remove lowest in tiq
        tiq.minFreq = tiq.getTopTermInfo().docFreq;
      }
    }
  }
  tiq.histogram.add(buckets);
  fieldMap.add("distinct", tiq.distinctTerms);

  // Include top terms
  fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

  // Add a histogram
  fieldMap.add("histogram", tiq.histogram.toNamedList());
}

Java Code Examples for org.apache.lucene.util.CharsRefBuilder#toString()