Java Code Examples for org.apache.lucene.util.BytesRef#utf8ToString()

The following examples show how to use org.apache.lucene.util.BytesRef#utf8ToString(). Each example comes from an open-source project; the source file, project, and license are listed above each snippet.
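Before the project snippets, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the basic round trip between a Java String and a BytesRef. The class name BytesRefDemo is illustrative; only the BytesRef constructor, the public length field, and utf8ToString() belong to the Lucene API.

import org.apache.lucene.util.BytesRef;

public class BytesRefDemo {
  public static void main(String[] args) {
    // The CharSequence constructor encodes the text as UTF-8 bytes.
    BytesRef ref = new BytesRef("café");

    // utf8ToString() decodes those bytes back into a java.lang.String.
    String decoded = ref.utf8ToString();

    System.out.println(decoded);    // café
    System.out.println(ref.length); // 5 -- the 'é' takes two UTF-8 bytes
  }
}

Note that utf8ToString() assumes the wrapped bytes are valid UTF-8; Examples 6 and 9 below guard against non-UTF-8 terms by falling back to BytesRef#toString()'s hex output.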
Example 1
Source File: ReadmeSimilarityCalculator.java    From scava with Eclipse Public License 2.0
private DocVector[] getDocumentVectors() throws IOException {
	DocVector[] docVector = new DocVector[getTotalDocumentInIndex()];
	for (int docId = 0; docId < getTotalDocumentInIndex(); docId++) {
		Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
		TermsEnum termsEnum = vector.iterator();
		BytesRef text = null;
		docVector[docId] = new DocVector(getAllTerms());
		while ((text = termsEnum.next()) != null) {
			String term = text.utf8ToString();
			int freq = (int) termsEnum.totalTermFreq();
			docVector[docId].setEntry(term, freq);
		}
		docVector[docId].normalize();
	}
	getIndexReader().close();
	return docVector;
}
 
Example 2
Source File: AbstractFeatureBuilder.java    From jate with GNU Lesser General Public License v3.0
protected Set<String> getUniqueWords() throws JATEException, IOException {
    Terms ngramInfo = SolrUtil.getTermVector(properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher);

    TermsEnum termsEnum = ngramInfo.iterator();
    Set<String> allWords = new HashSet<>();

    while (termsEnum.next() != null) {
        BytesRef t = termsEnum.term();
        if (t.length == 0)
            continue;
        String termStr = t.utf8ToString();
        if (!termStr.contains(" "))
            allWords.add(termStr);
    }
    if (allWords.size() == 0)
        throw new JATEException("MWEMetadata are required on 'Words', however there are no single-token lexical units in the "+
        properties.getSolrFieldNameJATENGramInfo()+" field. Check to see if your analyzer pipeline outputs uni-grams");
    return allWords;
}
 
Example 3
Source File: SrndTruncQuery.java    From lucene-solr with Apache License 2.0
@Override
public void visitMatchingTerms(
  IndexReader reader,
  String fieldName,
  MatchingTermVisitor mtv) throws IOException
{
  int prefixLength = prefix.length();
  Terms terms = MultiTerms.getTerms(reader, fieldName);
  if (terms != null) {
    Matcher matcher = pattern.matcher("");
    try {
      TermsEnum termsEnum = terms.iterator();

      TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
      BytesRef text;
      if (status == TermsEnum.SeekStatus.FOUND) {
        text = prefixRef;
      } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
        text = termsEnum.term();
      } else {
        text = null;
      }

      while (text != null) {
        if (StringHelper.startsWith(text, prefixRef)) {
          String textString = text.utf8ToString();
          matcher.reset(textString.substring(prefixLength));
          if (matcher.matches()) {
            mtv.visitMatchingTerm(new Term(fieldName, textString));
          }
        } else {
          break;
        }
        text = termsEnum.next();
      }
    } finally {
      matcher.reset();
    }
  }
}
 
Example 4
Source File: TestTermVectorsReader.java    From lucene-solr with Apache License 2.0
public void testOffsetReader() throws IOException {
  TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
  Terms vector = reader.get(0).terms(testFields[0]);
  assertNotNull(vector);
  TermsEnum termsEnum = vector.iterator();
  assertNotNull(termsEnum);
  assertEquals(testTerms.length, vector.size());
  PostingsEnum dpEnum = null;
  for (int i = 0; i < testTerms.length; i++) {
    final BytesRef text = termsEnum.next();
    assertNotNull(text);
    String term = text.utf8ToString();
    assertEquals(testTerms[i], term);

    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertNotNull(dpEnum);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(dpEnum.freq(), positions[i].length);
    for (int j = 0; j < positions[i].length; j++) {
      assertEquals(positions[i][j], dpEnum.nextPosition());
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertNotNull(dpEnum);
    assertEquals(dpEnum.freq(), positions[i].length);
    for (int j = 0; j < positions[i].length; j++) {
      assertEquals(positions[i][j], dpEnum.nextPosition());
      assertEquals(j*10, dpEnum.startOffset());
      assertEquals(j*10 + testTerms[i].length(), dpEnum.endOffset());
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
  }
  reader.close();
}
 
Example 5
Source File: IndexManager.java    From incubator-retired-blur with Apache License 2.0
private static String convert(FieldTypeDefinition typeDef, BytesRef currentTermText) {
  if (typeDef != null) {
    String readTerm = typeDef.readTerm(currentTermText);
    if (readTerm != null) {
      return readTerm;
    }
  }
  return currentTermText.utf8ToString();
}
 
Example 6
Source File: IDVersionSegmentTermsEnum.java    From lucene-solr with Apache License 2.0
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
  try {
    return b.utf8ToString() + " " + b;
  } catch (Throwable t) {
    // If BytesRef isn't actually UTF8, or it's eg a
    // prefix of UTF8 that ends mid-unicode-char, we
    // fallback to hex:
    return b.toString();
  }
}
 
Example 7
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0
private void buildFieldMap( ResponseBuilder rb ) throws IOException {
  Log.debug( "buildFieldMap" );
  SolrIndexSearcher searcher = rb.req.getSearcher();
  // build a synonym map from the SortedDocValues -
  // for each field value: lower case, stemmed, lookup synonyms from synonyms.txt - map to fieldValue
  SynonymMap.Builder fieldBuilder = new SynonymMap.Builder( true );
  SynonymMap.Builder termBuilder = new SynonymMap.Builder( true );
    
  ArrayList<String> searchFields = getStringFields( searcher );

  for (String searchField : searchFields ) {
    Log.debug( "adding searchField " + searchField );
    CharsRef fieldChars = new CharsRef( searchField );
    SortedSetDocValues sdv = FieldCache.DEFAULT.getDocTermOrds( searcher.getAtomicReader( ), searchField );
    if (sdv == null) continue;
    Log.debug( "got SortedSetDocValues for " + searchField );
    TermsEnum te = sdv.termsEnum();
    while (te.next() != null) {
      BytesRef term = te.term();
      String fieldValue = term.utf8ToString( );
      addTerm ( fieldChars, fieldValue, fieldBuilder, termBuilder );
    }
  }
    
  addDistributedTerms( rb, fieldBuilder, termBuilder, searchFields );
    
  fieldMap = fieldBuilder.build( );
  termMap = termBuilder.build( );
}
 
Example 8
Source File: TestIndexWriterUnicode.java    From lucene-solr with Apache License 2.0
private void checkTermsOrder(IndexReader r, Set<String> allTerms, boolean isTop) throws IOException {
  TermsEnum terms = MultiTerms.getTerms(r, "f").iterator();

  BytesRefBuilder last = new BytesRefBuilder();

  Set<String> seenTerms = new HashSet<>();

  while(true) {
    final BytesRef term = terms.next();
    if (term == null) {
      break;
    }

    assertTrue(last.get().compareTo(term) < 0);
    last.copyBytes(term);

    final String s = term.utf8ToString();
    assertTrue("term " + termDesc(s) + " was not added to index (count=" + allTerms.size() + ")", allTerms.contains(s));
    seenTerms.add(s);
  }

  if (isTop) {
    assertTrue(allTerms.equals(seenTerms));
  }

  // Test seeking:
  Iterator<String> it = seenTerms.iterator();
  while(it.hasNext()) {
    BytesRef tr = new BytesRef(it.next());
    assertEquals("seek failed for term=" + termDesc(tr.utf8ToString()),
                 TermsEnum.SeekStatus.FOUND,
                 terms.seekCeil(tr));
  }
}
 
Example 9
Source File: VersionBlockTreeTermsWriter.java    From lucene-solr with Apache License 2.0
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
  try {
    return b.utf8ToString() + " " + b;
  } catch (Throwable t) {
    // If BytesRef isn't actually UTF8, or it's eg a
    // prefix of UTF8 that ends mid-unicode-char, we
    // fallback to hex:
    return b.toString();
  }
}
 
Example 10
Source File: TestTermsEnum.java    From lucene-solr with Apache License 2.0
private String next(TermsEnum te) throws IOException {
  final BytesRef br = te.next();
  if (br == null) {
    return null;
  } else {
    return br.utf8ToString();
  }
}
 
Example 11
Source File: TermVectorFilteredLeafReader.java    From lucene-solr with Apache License 2.0
void moveToCurrentTerm() throws IOException {
  BytesRef currentTerm = in.term(); // from filteredTermsEnum
  boolean termInBothTermsEnum = baseTermsEnum.seekExact(currentTerm);

  if (!termInBothTermsEnum) {
    throw new IllegalStateException("Term vector term '" + currentTerm.utf8ToString() + "' does not appear in full index.");
  }
}
 
Example 12
Source File: FrequencyCtxWindowBasedFBWorker.java    From jate with GNU Lesser General Public License v3.0
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                            Map<Integer, Integer> sentenceBoundaries) throws IOException {
    List<MWEInSentence> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }


        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                SentenceContext sentenceContextInfo = null;
                if (payload != null) {
                    sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
                }
                if (sentenceContextInfo == null)
                    result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
                else {
                    result.add(new MWEInSentence(tString, start, end,
                            sentenceContextInfo.getFirstTokenIdx(),
                            sentenceContextInfo.getLastTokenIdx(),
                            sentenceContextInfo.getSentenceId()));

                    Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
                    if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx())
                        sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                                sentenceContextInfo.getLastTokenIdx());
                }
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
 
Example 13
Source File: SpatialTermQueryPrefixTreeStrategyFieldTypeDefinition.java    From incubator-retired-blur with Apache License 2.0
@Override
public String readTerm(BytesRef byteRef) {
  return byteRef.utf8ToString();
}
 
Example 14
Source File: SpatialRecursivePrefixTreeStrategyFieldTypeDefinition.java    From incubator-retired-blur with Apache License 2.0
@Override
public String readTerm(BytesRef byteRef) {
  return byteRef.utf8ToString();
}
 
Example 15
Source File: TermVectorComponent.java    From lucene-solr with Apache License 2.0
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
  NamedList<Object> fieldNL = new NamedList<>();
  docNL.add(field, fieldNL);

  BytesRef text;
  PostingsEnum dpEnum = null;
  while((text = termsEnum.next()) != null) {
    String term = text.utf8ToString();
    NamedList<Object> termInfo = new NamedList<>();
    fieldNL.add(term, termInfo);
    final int freq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq) {
      termInfo.add("tf", freq);
    }

    int dpEnumFlags = 0;
    dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
    //payloads require offsets
    dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
    dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

    boolean atNextDoc = false;
    if (dpEnum != null) {
      dpEnum.nextDoc();
      atNextDoc = true;
    }

    if (atNextDoc && dpEnumFlags != 0) {
      NamedList<Integer> positionsNL = null;
      NamedList<Number> theOffsets = null;
      NamedList<String> thePayloads = null;

      for (int i = 0; i < freq; i++) {
        final int pos = dpEnum.nextPosition();
        if (fieldOptions.positions && pos >= 0) {
          if (positionsNL == null) {
            positionsNL = new NamedList<>();
            termInfo.add("positions", positionsNL);
          }
          positionsNL.add("position", pos);
        }

        int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
        if (startOffset >= 0) {
          if (theOffsets == null) {
            theOffsets = new NamedList<>();
            termInfo.add("offsets", theOffsets);
          }
          theOffsets.add("start", dpEnum.startOffset());
          theOffsets.add("end", dpEnum.endOffset());
        }

        BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
        if (payload != null) {
          if (thePayloads == null) {
            thePayloads = new NamedList<>();
            termInfo.add("payloads", thePayloads);
          }
          thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
        }
      }
    }
    
    int df = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      df = reader.docFreq(new Term(field, text));
    }

    if (fieldOptions.docFreq) {
      termInfo.add("df", df);
    }

    // TODO: this is not TF/IDF by anyone's definition!
    if (fieldOptions.tfIdf) {
      double tfIdfVal = ((double) freq) / df;
      termInfo.add("tf-idf", tfIdfVal);
    }
  }
}
 
Example 16
Source File: DefaultSortedSetDocValuesReaderState.java    From lucene-solr with Apache License 2.0
/** Creates this, pulling doc values from the specified
 *  field. */
public DefaultSortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException {
  this.field = field;
  this.reader = reader;

  // We need this to create thread-safe MultiSortedSetDV
  // per collector:
  SortedSetDocValues dv = getDocValues();
  if (dv == null) {
    throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
  }
  if (dv.getValueCount() > Integer.MAX_VALUE) {
    throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount());
  }
  valueCount = (int) dv.getValueCount();

  // TODO: we can make this more efficient if eg we can be
  // "involved" when OrdinalMap is being created?  Ie see
  // each term/ord it's assigning as it goes...
  String lastDim = null;
  int startOrd = -1;

  // TODO: this approach can work for full hierarchy?;
  // TaxoReader can't do this since ords are not in
  // "sorted order" ... but we should generalize this to
  // support arbitrary hierarchy:
  for(int ord=0;ord<valueCount;ord++) {
    final BytesRef term = dv.lookupOrd(ord);
    String[] components = FacetsConfig.stringToPath(term.utf8ToString());
    if (components.length != 2) {
      throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.toString(components) + " " + term.utf8ToString());
    }
    if (!components[0].equals(lastDim)) {
      if (lastDim != null) {
        prefixToOrdRange.put(lastDim, new OrdRange(startOrd, ord-1));
      }
      startOrd = ord;
      lastDim = components[0];
    }
  }

  if (lastDim != null) {
    prefixToOrdRange.put(lastDim, new OrdRange(startOrd, valueCount-1));
  }
}
 
Example 17
Source File: AclReadFieldTypeDefinition.java    From incubator-retired-blur with Apache License 2.0
@Override
public String readTerm(BytesRef byteRef) {
  return byteRef.utf8ToString();
}
 
Example 18
Source File: FieldLessFieldTypeDefinition.java    From incubator-retired-blur with Apache License 2.0
@Override
public String readTerm(BytesRef byteRef) {
  return byteRef.utf8ToString();
}
 
Example 19
Source File: AclDiscoverFieldTypeDefinition.java    From incubator-retired-blur with Apache License 2.0
@Override
public String readTerm(BytesRef byteRef) {
  return byteRef.utf8ToString();
}
 
Example 20
Source File: DocValueFormat.java    From crate with Apache License 2.0
@Override
public String format(BytesRef value) {
    return value.utf8ToString();
}