Java Code Examples for org.apache.lucene.index.IndexReader#docFreq()
The following examples show how to use
org.apache.lucene.index.IndexReader#docFreq().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: QueryTermExtractor.java From lucene-solr with Apache License 2.0 | 6 votes |
/** * Extracts all terms texts of a given Query into an array of WeightedTerms * * @param query Query to extract term texts from * @param reader used to compute IDF which can be used to a) score selected fragments better * b) use graded highlights eg changing intensity of font color * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based * @return an array of the terms used in a query, plus their weights. */ public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) { WeightedTerm[] terms=getTerms(query,false, fieldName); int totalNumDocs=reader.maxDoc(); for (int i = 0; i < terms.length; i++) { try { int docFreq=reader.docFreq(new Term(fieldName,terms[i].term)); //IDF algorithm taken from ClassicSimilarity class float idf=(float)(Math.log(totalNumDocs/(double)(docFreq+1)) + 1.0); terms[i].weight*=idf; } catch (IOException e) { //ignore } } return terms; }
Example 2
Source File: LuceneHelper.java From dexter with Apache License 2.0 | 6 votes |
/**
 * Builds the TFIDF vector and its norm2.
 *
 * <p>For every entry of {@code freq} the score is
 * {@code frequency * (log2(collectionSize / df + 1) + 1)}, where {@code df} is the
 * document frequency of the term in {@code field}.
 *
 * @param tfidf
 *          - the vector containing for each term its TFIDF score, it will
 *          be populated by this method
 * @param freq
 *          - the vector containing for each term its frequency
 * @param field
 *          - the field on which to compute the inverse document frequency
 *
 * @return the norm of the TFIDF vector
 * @throws java.io.UncheckedIOException
 *           if the document frequency of a term cannot be read from the index
 */
private double tfidfVector(Map<String, Double> tfidf, Map<String, Integer> freq, String field) {
  IndexReader reader = getReader();
  double norm = 0;
  for (Map.Entry<String, Integer> entry : freq.entrySet()) {
    Term t = new Term(field, entry.getKey());
    final int df;
    try {
      df = reader.docFreq(t);
    } catch (IOException e) {
      logger.error("computing tfidfVector ({}) ", e.toString());
      // Surface the failure to the caller instead of terminating the whole JVM
      // from inside a helper; the original cause is preserved.
      throw new java.io.UncheckedIOException("cannot read document frequency for term " + t, e);
    }
    // NOTE(review): a df of 0 (term absent from the index) makes the division
    // non-finite, which propagates into the score and the norm - confirm whether
    // unseen terms can reach this method and how they should be weighted.
    double idf = Math.log(collectionSize / (double) df + 1) / Math.log(2) + 1;
    double tfidfValue = entry.getValue() * idf;
    norm += tfidfValue * tfidfValue;
    tfidf.put(entry.getKey(), tfidfValue);
  }
  return Math.sqrt(norm);
}
Example 3
Source File: PassageRankingComponent.java From wiseowl with MIT License | 5 votes |
protected float calculateWeight(Term term, IndexReader reader) throws IOException { //if a term is not in the index, then it's weight is 0 int docFrequency = reader.docFreq(term); if (docFrequency !=0) { log.warn("Term {} doc freq.{}",term.toString(),docFrequency); return 1.0f / docFrequency; } else { log.warn("Couldn't find doc freq for term {}", term); return 0f; } }
Example 4
Source File: WeightedSpanTermExtractor.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied * <code>IndexReader</code> to properly weight terms (for gradient highlighting). * * <p> * * @param query * that caused hit * @param tokenStream * of text to be highlighted * @param fieldName * restricts Term's used based on field name * @param reader * to use for scoring * @return Map of WeightedSpanTerms with quasi tf/idf scores * @throws IOException If there is a low-level I/O error */ public Map<String,WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, float boost, TokenStream tokenStream, String fieldName, IndexReader reader) throws IOException { if (fieldName != null) { this.fieldName = fieldName; } else { this.fieldName = null; } this.tokenStream = tokenStream; Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<>(); extract(query, boost, terms); int totalNumDocs = reader.maxDoc(); Set<String> weightedTerms = terms.keySet(); Iterator<String> it = weightedTerms.iterator(); try { while (it.hasNext()) { WeightedSpanTerm weightedSpanTerm = terms.get(it.next()); int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term)); // IDF algorithm taken from ClassicSimilarity class float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0); weightedSpanTerm.weight *= idf; } } finally { IOUtils.close(internalReader); } return terms; }
Example 5
Source File: WordBreakSpellChecker.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Wraps {@code text} in a {@code SuggestWord} whose frequency is the document
 * frequency of the term {@code fieldname:text} and whose score is fixed at 1.
 *
 * @param ir the index used to look up the document frequency
 * @param fieldname the field the term belongs to
 * @param text the term text, also stored as the suggestion string
 * @return the populated suggestion
 * @throws IOException if the document frequency lookup fails
 */
private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
  final int docFreq = ir.docFreq(new Term(fieldname, text));

  final SuggestWord suggestion = new SuggestWord();
  suggestion.string = text;
  suggestion.score = 1;
  suggestion.freq = docFreq;
  return suggestion;
}
Example 6
Source File: RandomSamplingFacetsCollector.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Note: if you use a counting {@link Facets} implementation, you can amortize the * sampled counts by calling this method. Uses the {@link FacetsConfig} and * the {@link IndexSearcher} to determine the upper bound for each facet value. */ public FacetResult amortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException { if (res == null || totalHits <= sampleSize) { return res; } LabelAndValue[] fixedLabelValues = new LabelAndValue[res.labelValues.length]; IndexReader reader = searcher.getIndexReader(); DimConfig dimConfig = config.getDimConfig(res.dim); // +2 to prepend dimension, append child label String[] childPath = new String[res.path.length + 2]; childPath[0] = res.dim; System.arraycopy(res.path, 0, childPath, 1, res.path.length); // reuse for (int i = 0; i < res.labelValues.length; i++) { childPath[res.path.length + 1] = res.labelValues[i].label; String fullPath = FacetsConfig.pathToString(childPath, childPath.length); int max = reader.docFreq(new Term(dimConfig.indexFieldName, fullPath)); int correctedCount = (int) (res.labelValues[i].value.doubleValue() / samplingRate); correctedCount = Math.min(max, correctedCount); fixedLabelValues[i] = new LabelAndValue(res.labelValues[i].label, correctedCount); } // cap the total count on the total number of non-deleted documents in the reader int correctedTotalCount = res.value.intValue(); if (correctedTotalCount > 0) { correctedTotalCount = Math.min(reader.numDocs(), (int) (res.value.doubleValue() / samplingRate)); } return new FacetResult(res.dim, res.path, correctedTotalCount, fixedLabelValues, res.childCount); }
Example 7
Source File: TermQuery.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
public Explanation explain(IndexReader reader, int doc) throws IOException { ComplexExplanation result = new ComplexExplanation(); result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); Explanation idfExpl = new Explanation(idf, "idf(docFreq=" + reader.docFreq(term) + ", numDocs=" + reader.numDocs() + ")"); // explain query weight Explanation queryExpl = new Explanation(); queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); Explanation boostExpl = new Explanation(getBoost(), "boost"); if (getBoost() != 1.0f) queryExpl.addDetail(boostExpl); queryExpl.addDetail(idfExpl); Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); queryExpl.addDetail(queryNormExpl); queryExpl.setValue(boostExpl.getValue() * idfExpl.getValue() * queryNormExpl.getValue()); result.addDetail(queryExpl); // explain field weight String field = term.field(); ComplexExplanation fieldExpl = new ComplexExplanation(); fieldExpl.setDescription("fieldWeight("+term+" in "+doc+ "), product of:"); Explanation tfExpl = scorer(reader).explain(doc); fieldExpl.addDetail(tfExpl); fieldExpl.addDetail(idfExpl); Explanation fieldNormExpl = new Explanation(); byte[] fieldNorms = reader.norms(field); float fieldNorm = fieldNorms!=null ? Similarity.decodeNorm(fieldNorms[doc]) : 0.0f; fieldNormExpl.setValue(fieldNorm); fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); fieldExpl.addDetail(fieldNormExpl); fieldExpl.setMatch(Boolean.valueOf(tfExpl.isMatch())); fieldExpl.setValue(tfExpl.getValue() * idfExpl.getValue() * fieldNormExpl.getValue()); result.addDetail(fieldExpl); result.setMatch(fieldExpl.getMatch()); // combine them result.setValue(queryExpl.getValue() * fieldExpl.getValue()); if (queryExpl.getValue() == 1.0f) return fieldExpl; return result; }
Example 8
Source File: WordBreakSpellChecker.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * <p>
 * Generate suggestions by breaking the passed-in term into multiple words.
 * The scores returned are equal to the number of word breaks needed so a
 * lower score is generally preferred over a higher score.
 * </p>
 *
 * @param term
 *          - the term to break up
 * @param maxSuggestions
 *          - values below 1 yield an empty result
 * @param ir
 *          - the index used to look up document frequencies
 * @param suggestMode
 *          - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
 * @param sortMethod
 *          - default =
 *          {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
 * @return one or more arrays of words formed by breaking up the original term
 * @throws IOException If there is a low-level I/O error.
 */
public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions,
    IndexReader ir, SuggestMode suggestMode,
    BreakSuggestionSortMethod sortMethod) throws IOException {
  if (maxSuggestions < 1) {
    return new SuggestWord[0][0];
  }

  // apply the documented defaults for omitted options
  if (suggestMode == null) {
    suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
  }
  if (sortMethod == null) {
    sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
  }

  final Comparator<SuggestWordArrayWrapper> ordering;
  if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
    ordering = new LengthThenMaxFreqComparator();
  } else {
    ordering = new LengthThenSumFreqComparator();
  }
  final Queue<SuggestWordArrayWrapper> suggestions =
      new PriorityQueue<>(Math.min(maxSuggestions, 10), ordering);

  final int origFreq = ir.docFreq(term);
  final boolean termIsIndexed = origFreq > 0;
  if (termIsIndexed && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) {
    return new SuggestWord[0][];
  }

  // in "more popular" mode suggestions need at least the original term's frequency (minimum 1)
  final int frequencyFloor;
  if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
    frequencyFloor = (origFreq != 0 ? origFreq : 1);
  } else {
    frequencyFloor = minSuggestionFrequency;
  }

  generateBreakUpSuggestions(term, ir, 1, maxSuggestions,
      frequencyFloor, new SuggestWord[0], suggestions, 0,
      sortMethod);

  // drain the queue head-first, filling the result from the last slot backwards
  final SuggestWord[][] result = new SuggestWord[suggestions.size()][];
  int slot = result.length;
  while (slot > 0) {
    result[--slot] = suggestions.remove().suggestWords;
  }
  return result;
}
Example 9
Source File: TermVectorComponent.java From lucene-solr with Apache License 2.0 | 4 votes |
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException { NamedList<Object> fieldNL = new NamedList<>(); docNL.add(field, fieldNL); BytesRef text; PostingsEnum dpEnum = null; while((text = termsEnum.next()) != null) { String term = text.utf8ToString(); NamedList<Object> termInfo = new NamedList<>(); fieldNL.add(term, termInfo); final int freq = (int) termsEnum.totalTermFreq(); if (fieldOptions.termFreq == true) { termInfo.add("tf", freq); } int dpEnumFlags = 0; dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0; //payloads require offsets dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0; dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0; dpEnum = termsEnum.postings(dpEnum, dpEnumFlags); boolean atNextDoc = false; if (dpEnum != null) { dpEnum.nextDoc(); atNextDoc = true; } if (atNextDoc && dpEnumFlags != 0) { NamedList<Integer> positionsNL = null; NamedList<Number> theOffsets = null; NamedList<String> thePayloads = null; for (int i = 0; i < freq; i++) { final int pos = dpEnum.nextPosition(); if (fieldOptions.positions && pos >= 0) { if (positionsNL == null) { positionsNL = new NamedList<>(); termInfo.add("positions", positionsNL); } positionsNL.add("position", pos); } int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1; if (startOffset >= 0) { if (theOffsets == null) { theOffsets = new NamedList<>(); termInfo.add("offsets", theOffsets); } theOffsets.add("start", dpEnum.startOffset()); theOffsets.add("end", dpEnum.endOffset()); } BytesRef payload = fieldOptions.payloads ? 
dpEnum.getPayload() : null; if (payload != null) { if (thePayloads == null) { thePayloads = new NamedList<>(); termInfo.add("payloads", thePayloads); } thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length)); } } } int df = 0; if (fieldOptions.docFreq || fieldOptions.tfIdf) { df = reader.docFreq(new Term(field, text)); } if (fieldOptions.docFreq) { termInfo.add("df", df); } // TODO: this is not TF/IDF by anyone's definition! if (fieldOptions.tfIdf) { double tfIdfVal = ((double) freq) / df; termInfo.add("tf-idf", tfIdfVal); } } }
Example 10
Source File: IndexManager.java From incubator-retired-blur with Apache License 2.0 | 4 votes |
/**
 * Returns the document frequency of the term built by
 * {@code getTerm(columnFamily, columnName, value)}.
 *
 * @param reader the index to query
 * @param columnFamily passed through to {@code getTerm}
 * @param columnName passed through to {@code getTerm}
 * @param value passed through to {@code getTerm}
 * @return the value reported by {@code reader.docFreq(...)} for that term
 * @throws IOException if the document frequency lookup fails
 */
public static long recordFrequency(IndexReader reader, String columnFamily, String columnName, String value)
    throws IOException {
  final long matchingDocs = reader.docFreq(getTerm(columnFamily, columnName, value));
  return matchingDocs;
}