Java Code Examples for org.apache.lucene.util.BytesRef#utf8ToString()

The following examples show how to use org.apache.lucene.util.BytesRef#utf8ToString(). Each example comes from an open-source project; the source file, project, and license are listed above each snippet.
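Before the project snippets, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the basic round trip between a Java String and a BytesRef. The class name BytesRefDemo is illustrative; only the BytesRef constructor, the public length field, and utf8ToString() belong to the Lucene API.

import org.apache.lucene.util.BytesRef;

public class BytesRefDemo {
  public static void main(String[] args) {
    // The CharSequence constructor encodes the text as UTF-8 bytes.
    BytesRef ref = new BytesRef("café");

    // utf8ToString() decodes those bytes back into a java.lang.String.
    String decoded = ref.utf8ToString();

    System.out.println(decoded);    // café
    System.out.println(ref.length); // 5 -- the 'é' takes two UTF-8 bytes
  }
}

Note that utf8ToString() assumes the wrapped bytes are valid UTF-8; Examples 6 and 9 below guard against non-UTF-8 terms by falling back to BytesRef#toString()'s hex output.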
Example 1
Source File: ReadmeSimilarityCalculator.java    From scava with Eclipse Public License 2.0
private DocVector[] getDocumentVectors() throws IOException {
	DocVector[] docVector = new DocVector[getTotalDocumentInIndex()];
	for (int docId = 0; docId < getTotalDocumentInIndex(); docId++) {
		Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
		TermsEnum termsEnum = vector.iterator();
		BytesRef text = null;
		docVector[docId] = new DocVector(getAllTerms());
		while ((text = termsEnum.next()) != null) {
			String term = text.utf8ToString();
			int freq = (int) termsEnum.totalTermFreq();
			docVector[docId].setEntry(term, freq);
		}
		docVector[docId].normalize();
	}
	getIndexReader().close();
	return docVector;
}
 
Example 2
Source File: AbstractFeatureBuilder.java    From jate with GNU Lesser General Public License v3.0
protected Set<String> getUniqueWords() throws JATEException, IOException {
    Terms ngramInfo = SolrUtil.getTermVector(properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher);

    TermsEnum termsEnum = ngramInfo.iterator();
    Set<String> allWords = new HashSet<>();

    while (termsEnum.next() != null) {
        BytesRef t = termsEnum.term();
        if (t.length == 0)
            continue;
        String termStr = t.utf8ToString();
        if (!termStr.contains(" "))
            allWords.add(termStr);
    }
    if (allWords.size() == 0)
        throw new JATEException("MWEMetadata are required on 'Words', however there are no single-token lexical units in the "+
        properties.getSolrFieldNameJATENGramInfo()+" field. Check to see if your analyzer pipeline outputs uni-grams");
    return allWords;
}
 
Example 3
Source File: SrndTruncQuery.java    From lucene-solr with Apache License 2.0
@Override
public void visitMatchingTerms(
  IndexReader reader,
  String fieldName,
  MatchingTermVisitor mtv) throws IOException
{
  int prefixLength = prefix.length();
  Terms terms = MultiTerms.getTerms(reader, fieldName);
  if (terms != null) {
    Matcher matcher = pattern.matcher("");
    try {
      TermsEnum termsEnum = terms.iterator();

      TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
      BytesRef text;
      if (status == TermsEnum.SeekStatus.FOUND) {
        text = prefixRef;
      } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
        text = termsEnum.term();
      } else {
        text = null;
      }

      while (text != null) {
        if (StringHelper.startsWith(text, prefixRef)) {
          String textString = text.utf8ToString();
          matcher.reset(textString.substring(prefixLength));
          if (matcher.matches()) {
            mtv.visitMatchingTerm(new Term(fieldName, textString));
          }
        } else {
          break;
        }
        text = termsEnum.next();
      }
    } finally {
      matcher.reset();
    }
  }
}
 
Example 4
Source File: TestTermVectorsReader.java    From lucene-solr with Apache License 2.0
public void testOffsetReader() throws IOException {
  TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
  Terms vector = reader.get(0).terms(testFields[0]);
  assertNotNull(vector);
  TermsEnum termsEnum = vector.iterator();
  assertNotNull(termsEnum);
  assertEquals(testTerms.length, vector.size());
  PostingsEnum dpEnum = null;
  for (int i = 0; i < testTerms.length; i++) {
    final BytesRef text = termsEnum.next();
    assertNotNull(text);
    String term = text.utf8ToString();
    assertEquals(testTerms[i], term);

    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertNotNull(dpEnum);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(dpEnum.freq(), positions[i].length);
    for (int j = 0; j < positions[i].length; j++) {
      assertEquals(positions[i][j], dpEnum.nextPosition());
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

    dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertNotNull(dpEnum);
    assertEquals(dpEnum.freq(), positions[i].length);
    for (int j = 0; j < positions[i].length; j++) {
      assertEquals(positions[i][j], dpEnum.nextPosition());
      assertEquals(j*10, dpEnum.startOffset());
      assertEquals(j*10 + testTerms[i].length(), dpEnum.endOffset());
    }
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
  }
  reader.close();
}
 
Example 5
Source File: IndexManager.java    From incubator-retired-blur with Apache License 2.0
private static String convert(FieldTypeDefinition typeDef, BytesRef currentTermText) {
  if (typeDef != null) {
    String readTerm = typeDef.readTerm(currentTermText);
    if (readTerm != null) {
      return readTerm;
    }
  }
  return currentTermText.utf8ToString();
}
 
Example 6
Source File: IDVersionSegmentTermsEnum.java    From lucene-solr with Apache License 2.0
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
  try {
    return b.utf8ToString() + " " + b;
  } catch (Throwable t) {
    // If BytesRef isn't actually UTF8, or it's eg a
    // prefix of UTF8 that ends mid-unicode-char, we
    // fallback to hex:
    return b.toString();
  }
}
 
Example 7
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0
private void buildFieldMap( ResponseBuilder rb ) throws IOException {
  Log.debug( "buildFieldMap" );
  SolrIndexSearcher searcher = rb.req.getSearcher();
  // build a synonym map from the SortedDocValues -
  // for each field value: lower case, stemmed, lookup synonyms from synonyms.txt - map to fieldValue
  SynonymMap.Builder fieldBuilder = new SynonymMap.Builder( true );
  SynonymMap.Builder termBuilder = new SynonymMap.Builder( true );
    
  ArrayList<String> searchFields = getStringFields( searcher );

  for (String searchField : searchFields ) {
    Log.debug( "adding searchField " + searchField );
    CharsRef fieldChars = new CharsRef( searchField );
    SortedSetDocValues sdv = FieldCache.DEFAULT.getDocTermOrds( searcher.getAtomicReader( ), searchField );
    if (sdv == null) continue;
    Log.debug( "got SortedSetDocValues for " + searchField );
    TermsEnum te = sdv.termsEnum();
    while (te.next() != null) {
      BytesRef term = te.term();
      String fieldValue = term.utf8ToString( );
      addTerm ( fieldChars, fieldValue, fieldBuilder, termBuilder );
    }
  }
    
  addDistributedTerms( rb, fieldBuilder, termBuilder, searchFields );
    
  fieldMap = fieldBuilder.build( );
  termMap = termBuilder.build( );
}
 
Example 8
Source File: TestIndexWriterUnicode.java    From lucene-solr with Apache License 2.0
private void checkTermsOrder(IndexReader r, Set<String> allTerms, boolean isTop) throws IOException {
  TermsEnum terms = MultiTerms.getTerms(r, "f").iterator();

  BytesRefBuilder last = new BytesRefBuilder();

  Set<String> seenTerms = new HashSet<>();

  while(true) {
    final BytesRef term = terms.next();
    if (term == null) {
      break;
    }

    assertTrue(last.get().compareTo(term) < 0);
    last.copyBytes(term);

    final String s = term.utf8ToString();
    assertTrue("term " + termDesc(s) + " was not added to index (count=" + allTerms.size() + ")", allTerms.contains(s));
    seenTerms.add(s);
  }

  if (isTop) {
    assertTrue(allTerms.equals(seenTerms));
  }

  // Test seeking:
  Iterator<String> it = seenTerms.iterator();
  while(it.hasNext()) {
    BytesRef tr = new BytesRef(it.next());
    assertEquals("seek failed for term=" + termDesc(tr.utf8ToString()),
                 TermsEnum.SeekStatus.FOUND,
                 terms.seekCeil(tr));
  }
}
 
Example 9
Source File: VersionBlockTreeTermsWriter.java    From lucene-solr with Apache License 2.0
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
  try {
    return b.utf8ToString() + " " + b;
  } catch (Throwable t) {
    // If BytesRef isn't actually UTF8, or it's eg a
    // prefix of UTF8 that ends mid-unicode-char, we
    // fallback to hex:
    return b.toString();
  }
}
 
Example 10
Source File: TestTermsEnum.java    From lucene-solr with Apache License 2.0
private String next(TermsEnum te) throws IOException {
  final BytesRef br = te.next();
  if (br == null) {
    return null;
  } else {
    return br.utf8ToString();
  }
}
 
Example 11
Source File: TermVectorFilteredLeafReader.java    From lucene-solr with Apache License 2.0
void moveToCurrentTerm() throws IOException {
  BytesRef currentTerm = in.term(); // from filteredTermsEnum
  boolean termInBothTermsEnum = baseTermsEnum.seekExact(currentTerm);

  if (!termInBothTermsEnum) {
    throw new IllegalStateException("Term vector term '" + currentTerm.utf8ToString() + "' does not appear in full index.");
  }
}
 
Example 12
Source File: FrequencyCtxWindowBasedFBWorker.java    From jate with GNU Lesser General Public License v3.0
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                            Map<Integer, Integer> sentenceBoundaries) throws IOException {
    List<MWEInSentence> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }


        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                SentenceContext sentenceContextInfo = null;
                if (payload != null) {
                    sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
                }
                if (sentenceContextInfo == null)
                    result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
                else {
                    result.add(new MWEInSentence(tString, start, end,
                            sentenceContextInfo.getFirstTokenIdx(),
                            sentenceContextInfo.getLastTokenIdx(),
                            sentenceContextInfo.getSentenceId()));

                    Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
                    if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx())
                        sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                                sentenceContextInfo.getLastTokenIdx());
                }
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
 
Example 13
Source File: SpatialTermQueryPrefixTreeStrategyFieldTypeDefinition.java    From incubator-retired-blur with Apache License 2.0
@Override
public String readTerm(BytesRef byteRef) {
  return byteRef.utf8ToString();
}
 
Example 14
Source File: SpatialRecursivePrefixTreeStrategyFieldTypeDefinition.java    From incubator-retired-blur with Apache License 2.0
@Override
public String readTerm(BytesRef byteRef) {
  return byteRef.utf8ToString();
}
 
Example 15
Source File: TermVectorComponent.java    From lucene-solr with Apache License 2.0
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
  NamedList<Object> fieldNL = new NamedList<>();
  docNL.add(field, fieldNL);

  BytesRef text;
  PostingsEnum dpEnum = null;
  while((text = termsEnum.next()) != null) {
    String term = text.utf8ToString();
    NamedList<Object> termInfo = new NamedList<>();
    fieldNL.add(term, termInfo);
    final int freq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq) {
      termInfo.add("tf", freq);
    }

    int dpEnumFlags = 0;
    dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
    //payloads require offsets
    dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
    dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

    boolean atNextDoc = false;
    if (dpEnum != null) {
      dpEnum.nextDoc();
      atNextDoc = true;
    }

    if (atNextDoc && dpEnumFlags != 0) {
      NamedList<Integer> positionsNL = null;
      NamedList<Number> theOffsets = null;
      NamedList<String> thePayloads = null;

      for (int i = 0; i < freq; i++) {
        final int pos = dpEnum.nextPosition();
        if (fieldOptions.positions && pos >= 0) {
          if (positionsNL == null) {
            positionsNL = new NamedList<>();
            termInfo.add("positions", positionsNL);
          }
          positionsNL.add("position", pos);
        }

        int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
        if (startOffset >= 0) {
          if (theOffsets == null) {
            theOffsets = new NamedList<>();
            termInfo.add("offsets", theOffsets);
          }
          theOffsets.add("start", dpEnum.startOffset());
          theOffsets.add("end", dpEnum.endOffset());
        }

        BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
        if (payload != null) {
          if (thePayloads == null) {
            thePayloads = new NamedList<>();
            termInfo.add("payloads", thePayloads);
          }
          thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
        }
      }
    }
    
    int df = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      df = reader.docFreq(new Term(field, text));
    }

    if (fieldOptions.docFreq) {
      termInfo.add("df", df);
    }

    // TODO: this is not TF/IDF by anyone's definition!
    if (fieldOptions.tfIdf) {
      double tfIdfVal = ((double) freq) / df;
      termInfo.add("tf-idf", tfIdfVal);
    }
  }
}
 
Example 16
Source File: DefaultSortedSetDocValuesReaderState.java    From lucene-solr with Apache License 2.0
/** Creates this, pulling doc values from the specified
 *  field. */
public DefaultSortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException {
  this.field = field;
  this.reader = reader;

  // We need this to create thread-safe MultiSortedSetDV
  // per collector:
  SortedSetDocValues dv = getDocValues();
  if (dv == null) {
    throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
  }
  if (dv.getValueCount() > Integer.MAX_VALUE) {
    throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount());
  }
  valueCount = (int) dv.getValueCount();

  // TODO: we can make this more efficient if eg we can be
  // "involved" when OrdinalMap is being created?  Ie see
  // each term/ord it's assigning as it goes...
  String lastDim = null;
  int startOrd = -1;

  // TODO: this approach can work for full hierarchy?;
  // TaxoReader can't do this since ords are not in
  // "sorted order" ... but we should generalize this to
  // support arbitrary hierarchy:
  for(int ord=0;ord<valueCount;ord++) {
    final BytesRef term = dv.lookupOrd(ord);
    String[] components = FacetsConfig.stringToPath(term.utf8ToString());
    if (components.length != 2) {
      throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.toString(components) + " " + term.utf8ToString());
    }
    if (!components[0].equals(lastDim)) {
      if (lastDim != null) {
        prefixToOrdRange.put(lastDim, new OrdRange(startOrd, ord-1));
      }
      startOrd = ord;
      lastDim = components[0];
    }
  }

  if (lastDim != null) {
    prefixToOrdRange.put(lastDim, new OrdRange(startOrd, valueCount-1));
  }
}
 
Example 17
Source File: AclReadFieldTypeDefinition.java    From incubator-retired-blur with Apache License 2.0
@Override
public String readTerm(BytesRef byteRef) {
  return byteRef.utf8ToString();
}
 
Example 18
Source File: FieldLessFieldTypeDefinition.java    From incubator-retired-blur with Apache License 2.0
@Override
public String readTerm(BytesRef byteRef) {
  return byteRef.utf8ToString();
}
 
Example 19
Source File: AclDiscoverFieldTypeDefinition.java    From incubator-retired-blur with Apache License 2.0
@Override
public String readTerm(BytesRef byteRef) {
  return byteRef.utf8ToString();
}
 
Example 20
Source File: DocValueFormat.java    From crate with Apache License 2.0
@Override
public String format(BytesRef value) {
    return value.utf8ToString();
}