org.apache.lucene.util.IntsRef Java Examples

The following examples show how to use org.apache.lucene.util.IntsRef. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DatawaveFieldIndexListIteratorJexl.java    From datawave with Apache License 2.0 6 votes vote down vote up
/**
 * Builds an FST from the given strings.
 *
 * <p>Every value is converted with {@code Util.toUTF16} into a shared scratch builder and added,
 * in the set's iteration order, paired with the "no output" value of {@code NoOutputs}.
 *
 * @param values the strings to add
 * @return whatever {@code Builder.finish()} returns for the added values
 * @throws IOException declared because adding to and finishing the builder may throw it
 */
public static FST<?> getFST(SortedSet<String> values) throws IOException {
    // Builder configuration, spelled out explicitly.
    final FST.INPUT_TYPE fstInputType = FST.INPUT_TYPE.BYTE1;
    final int suffixCountMin1 = 0;
    final int suffixCountMin2 = 0;
    final boolean shareSuffixes = true;
    final boolean shareNonSingletonNodes = true;
    final int maxSharedTailLength = Integer.MAX_VALUE;
    final boolean arrayArcsAllowed = true;
    final int pageBits = 15;
    final Outputs<Object> outputs = NoOutputs.getSingleton();

    final org.apache.lucene.util.fst.Builder<Object> fstBuilder = new org.apache.lucene.util.fst.Builder<>(
                    fstInputType, suffixCountMin1, suffixCountMin2, shareSuffixes, shareNonSingletonNodes,
                    maxSharedTailLength, outputs, arrayArcsAllowed, pageBits);

    // One scratch builder is reused for every value.
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for (String value : values) {
        Util.toUTF16(value, scratch);
        fstBuilder.add(scratch.get(), outputs.getNoOutput());
    }
    return fstBuilder.finish();
}
 
Example #2
Source File: TaxonomyFacetCounts.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Calls {@code increment} for every ordinal of every matching document, then calls
 * {@code rollup()}.
 *
 * @param matchingDocs the per-segment hits to walk
 * @throws IOException declared because reading ordinals or advancing the iterator may throw it
 */
private final void count(List<MatchingDocs> matchingDocs) throws IOException {
  // Reused for every document; filled by the segment reader.
  final IntsRef ordinals = new IntsRef();
  for (MatchingDocs segmentHits : matchingDocs) {
    final OrdinalsReader.OrdinalsSegmentReader segmentOrds = ordinalsReader.getReader(segmentHits.context);
    final DocIdSetIterator it = segmentHits.bits.iterator();

    for (int docId = it.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = it.nextDoc()) {
      segmentOrds.get(docId, ordinals);
      int pos = 0;
      while (pos < ordinals.length) {
        increment(ordinals.ints[ordinals.offset + pos]);
        pos++;
      }
    }
  }

  rollup();
}
 
Example #3
Source File: TestFSTsMisc.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Runs {@code numIter} iterations; in each one, for input modes 0 and 1, draws a random number of
 * distinct random terms (at most {@code maxNumWords}) and passes them to {@code doTest}.
 *
 * @param maxNumWords inclusive upper bound on the number of terms per run
 * @param numIter number of iterations
 * @throws IOException declared because {@code doTest} may throw it
 */
private void testRandomWords(int maxNumWords, int numIter) throws IOException {
  Random random = new Random(random().nextLong());
  for (int iter = 0; iter < numIter; iter++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter " + iter);
    }
    for (int inputMode = 0; inputMode < 2; inputMode++) {
      final int numWords = random.nextInt(maxNumWords + 1);
      // The set de-duplicates terms, so keep drawing until numWords distinct ones exist.
      Set<IntsRef> termsSet = new HashSet<>();
      while (termsSet.size() < numWords) {
        final String term = getRandomString(random);
        termsSet.add(toIntsRef(term, inputMode));
      }
      doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
    }
  }
}
 
Example #4
Source File: TestDictionary.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@code Dictionary} from "compressed-before-set.aff" and "compressed.dic" and checks the
 * suffix/prefix lookup sizes and that the first flag entry of the word "olr" decodes to one flag.
 *
 * @throws Exception if loading the dictionary or closing a resource fails
 */
public void testCompressedBeforeSetDictionary() throws Exception {
  // try-with-resources closes in reverse declaration order (affixStream, dictStream, tempDir)
  // and still runs when the constructor or an assertion throws, so nothing leaks on failure.
  try (Directory tempDir = getDirectory();
       InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
       InputStream affixStream = getClass().getResourceAsStream("compressed-before-set.aff")) {
    Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
    IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3);
    BytesRef ref = new BytesRef();
    // The first valid int of an IntsRef is at ints[offset], which is not necessarily ints[0].
    dictionary.flagLookup.get(ordList.ints[ordList.offset], ref);
    char[] flags = Dictionary.decodeFlags(ref);
    assertEquals(1, flags.length);
  }
}
 
Example #5
Source File: ContextQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Splits {@code ref} at the first {@code ContextSuggestField.CONTEXT_SEPARATOR} found at or after
 * relative index {@code offset}, stores the context that precedes it in {@code currentContext},
 * and passes the part that follows it to {@code innerWeight.setNextMatch}.
 *
 * <p>When a separator is found, {@code ref} itself is modified: its {@code offset} and
 * {@code length} are rewritten. When no separator is found the method returns without touching
 * {@code currentContext}, {@code ref} or {@code innerWeight}.
 *
 * @param ref the path to split; mutated in place
 * @param offset relative index (added to {@code ref.offset}) at which the scan starts
 */
private void setInnerWeight(IntsRef ref, int offset) {
  IntsRefBuilder refBuilder = new IntsRefBuilder();
  for (int i = offset; i < ref.length; i++) {
    if (ref.ints[ref.offset + i] == ContextSuggestField.CONTEXT_SEPARATOR) {
      if (i > 0) {
        // the i ints before the separator are the context; decode them as UTF-8
        refBuilder.copyInts(ref.ints, ref.offset, i);
        currentContext = Util.toBytesRef(refBuilder.get(), scratch).utf8ToString();
      } else {
        // separator is the very first int: there is no context
        currentContext = null;
      }
      // step past the separator
      // NOTE(review): i is relative to the incoming ref.offset, yet it becomes the new absolute
      // offset here and is used as an absolute index two lines below; that lines up only if
      // ref.offset was 0 on entry -- confirm against callers
      ref.offset = ++i;
      assert ref.offset < ref.length : "input should not end with the context separator";
      if (ref.ints[i] == ConcatenateGraphFilter.SEP_LABEL) {
        // also skip a SEP_LABEL that directly follows the separator
        ref.offset++;
        assert ref.offset < ref.length : "input should not end with a context separator followed by SEP_LABEL";
      }
      // shrink the length by the number of skipped ints (same offset-was-0 assumption as above)
      ref.length = ref.length - ref.offset;
      // refBuilder is reused: copyInts is called again with the remainder
      refBuilder.copyInts(ref.ints, ref.offset, ref.length);
      innerWeight.setNextMatch(refBuilder.get());
      // only the first separator is handled
      return;
    }
  }
}
 
Example #6
Source File: ContextQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Picks the boost for {@code pathPrefix} from the first context length whose prefix is a key of
 * {@code contextMap}, then delegates to {@code setInnerWeight}. If no prefix matches, the boost is
 * set to 0 and {@code setInnerWeight} is called with offset 0.
 *
 * @param pathPrefix the matched path; a clone is modified, not this instance
 */
@Override
protected void setNextMatch(final IntsRef pathPrefix) {
  final IntsRef candidate = pathPrefix.clone();

  // Try each defined context length in the order contextLengths supplies them
  // (expected to list longer contexts first); lengths beyond the path are skipped.
  for (int contextLength : contextLengths) {
    if (contextLength <= pathPrefix.length) {
      candidate.length = contextLength;
      if (contextMap.containsKey(candidate)) {
        currentBoost = contextMap.get(candidate);
        candidate.length = pathPrefix.length;
        setInnerWeight(candidate, contextLength);
        return;
      }
    }
  }

  // unknown context
  currentBoost = 0f;
  candidate.length = pathPrefix.length;
  setInnerWeight(candidate, 0);
}
 
Example #7
Source File: IntSequenceOutputs.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Removes the leading {@code inc} from {@code output}.
 *
 * @param output the full sequence; must not be null
 * @param inc the prefix to remove; must not be null
 * @return {@code output} itself when {@code inc} is {@code NO_OUTPUT}, {@code NO_OUTPUT} when both
 *     have the same length, otherwise a new {@code IntsRef} over the same array that starts
 *     {@code inc.length} ints later
 */
@Override
public IntsRef subtract(IntsRef output, IntsRef inc) {
  assert output != null;
  assert inc != null;

  if (inc == NO_OUTPUT) {
    // no prefix removed
    return output;
  }
  if (inc.length == output.length) {
    // entire output removed
    return NO_OUTPUT;
  }

  assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length;
  assert inc.length > 0;
  final int suffixStart = output.offset + inc.length;
  final int suffixLength = output.length - inc.length;
  return new IntsRef(output.ints, suffixStart, suffixLength);
}
 
Example #8
Source File: ContextQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the automaton that matches the context part of a path, including the trailing
 * {@code ContextSuggestField.CONTEXT_SEPARATOR}.
 *
 * @param contexts the contexts to match, keyed by their int sequence
 * @param matchAllContexts when true, any string followed by the separator is matched
 * @return "any string + separator" when {@code matchAllContexts} is set or {@code contexts} is
 *     empty; otherwise the union of one automaton per context
 */
private static Automaton toContextAutomaton(final Map<IntsRef, ContextMetaData> contexts, final boolean matchAllContexts) {
  final Automaton anyString = Operations.repeat(Automata.makeAnyString());
  final Automaton separator = Automata.makeChar(ContextSuggestField.CONTEXT_SEPARATOR);

  if (matchAllContexts || contexts.isEmpty()) {
    return Operations.concatenate(anyString, separator);
  }

  Automaton combined = null;
  for (Map.Entry<IntsRef, ContextMetaData> entry : contexts.entrySet()) {
    final IntsRef key = entry.getKey();
    Automaton single = Automata.makeString(key.ints, key.offset, key.length);
    if (!entry.getValue().exact) {
      // non-exact contexts also accept anything after the context itself
      single = Operations.concatenate(single, anyString);
    }
    single = Operations.concatenate(single, separator);
    combined = (combined == null) ? single : Operations.union(combined, single);
  }
  return combined;
}
 
Example #9
Source File: IntSequenceOutputs.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Concatenates {@code prefix} and {@code output}.
 *
 * @param prefix the leading sequence; must not be null
 * @param output the trailing sequence; must not be null
 * @return the other argument when one of them is {@code NO_OUTPUT}; otherwise a newly allocated
 *     {@code IntsRef} holding the ints of {@code prefix} followed by those of {@code output}
 */
@Override
public IntsRef add(IntsRef prefix, IntsRef output) {
  assert prefix != null;
  assert output != null;

  if (prefix == NO_OUTPUT) {
    return output;
  }
  if (output == NO_OUTPUT) {
    return prefix;
  }

  assert prefix.length > 0;
  assert output.length > 0;
  final int combinedLength = prefix.length + output.length;
  final IntsRef joined = new IntsRef(combinedLength);
  System.arraycopy(prefix.ints, prefix.offset, joined.ints, 0, prefix.length);
  System.arraycopy(output.ints, output.offset, joined.ints, prefix.length, output.length);
  joined.length = combinedLength;
  return joined;
}
 
Example #10
Source File: TokenInfoDictionaryTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a dictionary from two entries and checks the left id, right id and word cost of the
 * first word id returned by {@code lookupWordIds} for source ids 0 and 1.
 */
public void testPut() throws Exception {
  final String smallIdEntry = "名詞,1,1,2,名詞,一般,*,*,*,*,*,*,*";
  // "large" id
  final String largeIdEntry = "一般,5000,5000,3,名詞,一般,*,*,*,*,*,*,*";
  final TokenInfoDictionary dict = newDictionary(smallIdEntry, largeIdEntry);
  final IntsRef ids = new IntsRefBuilder().get();

  dict.lookupWordIds(0, ids);
  final int firstWordId = ids.ints[ids.offset];
  assertEquals(5000, dict.getLeftId(firstWordId));
  assertEquals(5000, dict.getRightId(firstWordId));
  assertEquals(3, dict.getWordCost(firstWordId));

  dict.lookupWordIds(1, ids);
  final int secondWordId = ids.ints[ids.offset];
  assertEquals(1, dict.getLeftId(secondWordId));
  assertEquals(1, dict.getRightId(secondWordId));
  assertEquals(2, dict.getWordCost(secondWordId));
}
 
Example #11
Source File: FuzzyCompletionQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Sets {@code currentBoost} to the length of the longest common prefix between
 * {@code pathPrefix} and any entry of {@code refs}.
 *
 * @param pathPrefix the matched path prefix
 */
@Override
protected void setNextMatch(IntsRef pathPrefix) {
  // NOTE: the last letter of the matched prefix for the exact
  // match never makes it through here
  // so an exact match and a match with only a edit at the
  // end is boosted the same
  int longestShared = 0;
  for (IntsRef ref : refs) {
    final int limit = Math.min(ref.length, pathPrefix.length);
    int shared = 0;
    while (shared < limit
        && ref.ints[shared + ref.offset] == pathPrefix.ints[shared + pathPrefix.offset]) {
      shared++;
    }
    if (shared > longestShared) {
      longestShared = shared;
    }
  }
  currentBoost = longestShared;
}
 
Example #12
Source File: TestDictionary.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@code Dictionary} from "compressed.aff" and "compressed.dic" and checks the
 * suffix/prefix lookup sizes and that the first flag entry of the word "olr" decodes to one flag.
 *
 * @throws Exception if loading the dictionary or closing a resource fails
 */
public void testCompressedDictionary() throws Exception {
  // try-with-resources closes in reverse declaration order (affixStream, dictStream, tempDir)
  // and still runs when the constructor or an assertion throws, so nothing leaks on failure.
  try (Directory tempDir = getDirectory();
       InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
       InputStream affixStream = getClass().getResourceAsStream("compressed.aff")) {
    Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
    IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3);
    BytesRef ref = new BytesRef();
    // The first valid int of an IntsRef is at ints[offset], which is not necessarily ints[0].
    dictionary.flagLookup.get(ordList.ints[ordList.offset], ref);
    char[] flags = Dictionary.decodeFlags(ref);
    assertEquals(1, flags.length);
  }
}
 
Example #13
Source File: TokenInfoDictionaryTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a dictionary from two entries and checks the left id, right id and word cost of the
 * first word id returned by {@code lookupWordIds} for source ids 0 and 1.
 */
public void testPut() throws Exception {
  final String smallIdEntry = "명사,1,1,2,NNG,*,*,*,*,*,*,*";
  // "large" id
  final String largeIdEntry = "일반,5000,5000,3,NNG,*,*,*,*,*,*,*";
  final TokenInfoDictionary dict = newDictionary(smallIdEntry, largeIdEntry);
  final IntsRef ids = new IntsRefBuilder().get();

  dict.lookupWordIds(0, ids);
  final int firstWordId = ids.ints[ids.offset];
  assertEquals(1, dict.getLeftId(firstWordId));
  assertEquals(1, dict.getRightId(firstWordId));
  assertEquals(2, dict.getWordCost(firstWordId));

  dict.lookupWordIds(1, ids);
  final int secondWordId = ids.ints[ids.offset];
  assertEquals(5000, dict.getLeftId(secondWordId));
  assertEquals(5000, dict.getRightId(secondWordId));
  assertEquals(3, dict.getWordCost(secondWordId));
}
 
Example #14
Source File: Util.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** Walks the FST along the labels of {@code input} and returns the
 *  accumulated output (including the final output of the last arc),
 *  or null if some label has no arc or the last arc is not final. */
public static<T> T get(FST<T> fst, IntsRef input) throws IOException {

  // TODO: would be nice not to alloc this on every lookup
  final FST.Arc<T> current = fst.getFirstArc(new FST.Arc<T>());

  final BytesReader reader = fst.getBytesReader();

  // Outputs are combined arc by arc, starting from the "no output" value
  T accumulated = fst.outputs.getNoOutput();
  int pos = 0;
  while (pos < input.length) {
    final int label = input.ints[input.offset + pos];
    // the same Arc instance is both the source and the destination of the lookup
    if (fst.findTargetArc(label, current, current, reader) == null) {
      return null;
    }
    accumulated = fst.outputs.add(accumulated, current.output());
    pos++;
  }

  if (!current.isFinal()) {
    return null;
  }
  return fst.outputs.add(accumulated, current.nextFinalOutput());
}
 
Example #15
Source File: LimitedFiniteStringsIteratorTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Iterates the union of "foo" and "bar" with a limit of -1 and checks that both the collected
 * list and the iterator's {@code size()} report two strings.
 */
public void testSize() {
  final Automaton foo = Automata.makeString("foo");
  final Automaton bar = Automata.makeString("bar");
  final Automaton fooOrBar = Operations.union(foo, bar);
  final LimitedFiniteStringsIterator it = new LimitedFiniteStringsIterator(fooOrBar, -1);
  final List<IntsRef> collected = getFiniteStrings(it);
  assertEquals(2, collected.size());
  assertEquals(2, it.size());
}
 
Example #16
Source File: Util.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Clears {@code scratch}, appends every byte of {@code input}
 *  as an unsigned value (0..255), and returns the result of
 *  {@code scratch.get()}. */
public static IntsRef toIntsRef(BytesRef input, IntsRefBuilder scratch) {
  scratch.clear();
  final int end = input.offset + input.length;
  for (int pos = input.offset; pos < end; pos++) {
    // mask to drop the sign extension of the byte
    scratch.append(input.bytes[pos] & 0xFF);
  }
  return scratch.get();
}
 
Example #17
Source File: TestAutomaton.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that concatenating the empty automaton with "foo", on either side, yields no finite
 * strings.
 */
public void testConcatEmpty() throws Exception {
  // If you concat empty automaton to anything the result should still be empty:
  final HashSet<IntsRef> noStrings = new HashSet<>();

  final Automaton emptyThenFoo = Operations.concatenate(Automata.makeEmpty(), Automata.makeString("foo"));
  assertEquals(noStrings, TestOperations.getFiniteStrings(emptyThenFoo));

  final Automaton fooThenEmpty = Operations.concatenate(Automata.makeString("foo"), Automata.makeEmpty());
  assertEquals(noStrings, TestOperations.getFiniteStrings(fooThenEmpty));
}
 
Example #18
Source File: IntSequenceOutputs.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads a length-prefixed int sequence: one vInt for the count, then that many vInts.
 *
 * @param in the input to read from
 * @return {@code NO_OUTPUT} when the count is 0, otherwise a new {@code IntsRef} with the values
 * @throws IOException if reading from {@code in} fails
 */
@Override
public IntsRef read(DataInput in) throws IOException {
  final int count = in.readVInt();
  if (count == 0) {
    return NO_OUTPUT;
  }

  final IntsRef result = new IntsRef(count);
  int next = 0;
  while (next < count) {
    result.ints[next] = in.readVInt();
    next++;
  }
  result.length = count;
  return result;
}
 
Example #19
Source File: Util.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Reverse lookup: finds the input whose output equals
 *  {@code targetOutput}, for the special case of an FST whose
 *  outputs are strictly ascending.  Returns null if no such
 *  output exists.
 *
 *  <p>NOTE: this only works with {@code FST<Long>}, and only
 *  when the outputs ascend in the same order as the inputs,
 *  e.g. simple ordinals (0, 1, 2, ...) or file offsets (when
 *  appending to a file).
 *
 *  <p>This overload allocates the reader, the arcs and the
 *  result builder, then delegates to the six-argument
 *  {@code getByOutput}. */
@Deprecated
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {

  final BytesReader reader = fst.getBytesReader();

  // TODO: would be nice not to alloc this on every lookup
  final FST.Arc<Long> firstArc = fst.getFirstArc(new FST.Arc<Long>());

  final FST.Arc<Long> spareArc = new FST.Arc<>();

  final IntsRefBuilder resultBuilder = new IntsRefBuilder();
  return getByOutput(fst, targetOutput, reader, firstArc, spareArc, resultBuilder);
}
 
Example #20
Source File: FSTTester.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the ints of {@code ir} into a new {@code BytesRef}, one byte per int. Each int is
 * asserted to be in 0..255 before it is narrowed to a byte.
 */
private static BytesRef toBytesRef(IntsRef ir) {
  final int count = ir.length;
  final BytesRef converted = new BytesRef(count);
  for (int dst = 0, src = ir.offset; dst < count; dst++, src++) {
    final int value = ir.ints[src];
    assert value >= 0 && value <= 255;
    converted.bytes[dst] = (byte) value;
  }
  converted.length = count;
  return converted;
}
 
Example #21
Source File: IntsRefFSTEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Seeks to the smallest term that's &gt;= {@code target}: records
 *  the target and its length, runs {@code doSeekCeil()} and returns
 *  what {@code setResult()} produces. */
public InputOutput<T> seekCeil(IntsRef target) throws IOException {
  this.target = target;
  this.targetLength = target.length;
  super.doSeekCeil();
  final InputOutput<T> positioned = setResult();
  return positioned;
}
 
Example #22
Source File: FSTTester.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Renders {@code term} for display.
 *
 * @param inputMode 0 decodes the term as UTF-8 bytes; any other value treats it as code points
 * @param term the term to render
 * @param isValidUnicode when false, only {@code term.toString()} is returned
 * @return the decoded text, a space, and {@code term}'s own string form; or just the latter
 */
static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
  if (!isValidUnicode) {
    return term.toString();
  }

  final String decoded;
  if (inputMode == 0) {
    // utf8
    decoded = toBytesRef(term).utf8ToString();
  } else {
    // utf32
    decoded = UnicodeUtil.newString(term.ints, term.offset, term.length);
  }
  return decoded + " " + term;
}
 
Example #23
Source File: Util.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Decodes the Unicode code points in {@code s[offset, offset+length)}
 *  into the provided scratch builder, which must not be null,
 *  sets its length to the number of code points and returns
 *  {@code scratch.get()}. */
public static IntsRef toUTF32(char[] s, int offset, int length, IntsRefBuilder scratch) {
  final int end = offset + length;
  int written = 0;
  for (int read = offset; read < end; written++) {
    scratch.grow(written + 1);
    final int codePoint = Character.codePointAt(s, read, end);
    scratch.setIntAt(written, codePoint);
    // a supplementary code point consumes two chars
    read += Character.charCount(codePoint);
  }
  scratch.setLength(written);
  return scratch.get();
}
 
Example #24
Source File: AutomatonTestUtil.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Recursively collects into <code>strings</code> the strings that can be
 * produced from state <code>s</code>, each prefixed by the current contents
 * of <code>path</code>.
 * Returns false if a transition leads back to a state already on the current
 * path (a cycle), or if more than <code>limit</code> strings are found;
 * returns true otherwise.
 * <code>limit</code>&lt;0 means "infinite".
 * <p>
 * On a false return, <code>pathstates</code> and <code>path</code> are left
 * as they were at the point of failure (they are not unwound).
 */
private static boolean getFiniteStrings(Automaton a, int s, HashSet<Integer> pathstates, 
    HashSet<IntsRef> strings, IntsRefBuilder path, int limit) {
  // mark s as being on the current path
  pathstates.add(s);
  Transition t = new Transition();
  int count = a.initTransition(s, t);
  for (int i=0;i<count;i++) {
    a.getNextTransition(t);
    if (pathstates.contains(t.dest)) {
      // destination is already on the current path: cycle
      return false;
    }
    // one branch per label in the transition's [min, max] range
    for (int n = t.min; n <= t.max; n++) {
      path.append(n);
      if (a.isAccept(t.dest)) {
        // record the result of path.toIntsRef() for the current path
        strings.add(path.toIntsRef());
        if (limit >= 0 && strings.size() > limit) {
          return false;
        }
      }
      if (!getFiniteStrings(a, t.dest, pathstates, strings, path, limit)) {
        return false;
      }
      // drop the label appended above before trying the next one
      path.setLength(path.length() - 1);
    }
  }
  // s is no longer on the current path
  pathstates.remove(s);
  return true;
}
 
Example #25
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns every finite string of {@code a}, decoded as UTF-8, with each
 * {@code TokenStreamToAutomaton.POS_SEP} character replaced by a space.
 */
private Set<String> toPathStrings(Automaton a) {
  final BytesRefBuilder scratch = new BytesRefBuilder();
  final Set<String> result = new HashSet<>();
  final char posSep = (char) TokenStreamToAutomaton.POS_SEP;
  for (IntsRef path : AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
    final String decoded = Util.toBytesRef(path, scratch).utf8ToString();
    result.add(decoded.replace(posSep, ' '));
  }
  return result;
}
 
Example #26
Source File: TestFuzzyQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Converts {@code s} into an {@code IntsRef} holding one int per Unicode code point.
 */
private static IntsRef toIntsRef(String s) {
  final int charCount = s.length();
  // worst case: every char is its own code point
  final IntsRef ref = new IntsRef(charCount);
  int charIdx = 0;
  while (charIdx < charCount) {
    final int codePoint = Character.codePointAt(s, charIdx);
    ref.ints[ref.length++] = codePoint;
    // surrogate pairs advance by two chars
    charIdx += Character.charCount(codePoint);
  }
  return ref;
}
 
Example #27
Source File: IntsRefFSTEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Seeks to exactly this term, returning null if the term
 *  doesn't exist.  This is faster than using {@link
 *  #seekFloor} or {@link #seekCeil} because it
 *  short-circuits as soon the match is not found.
 *  The target and its length are recorded before the seek. */
public InputOutput<T> seekExact(IntsRef target) throws IOException {
  this.target = target;
  this.targetLength = target.length;
  if (!doSeekExact()) {
    return null;
  }
  assert upto == 1+target.length;
  return setResult();
}
 
Example #28
Source File: TestAutomaton.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that the finite strings of the determinized {@code a} are exactly {@code strings},
 * compared as UTF-32 int sequences.
 */
private void assertMatches(Automaton a, String... strings) {
  final Set<IntsRef> wanted = new HashSet<>();
  for (String s : strings) {
    // a fresh builder per string, so every IntsRef in the set has its own backing builder
    wanted.add(Util.toUTF32(s, new IntsRefBuilder()));
  }

  final Automaton determinized = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
  assertEquals(wanted, TestOperations.getFiniteStrings(determinized));
}
 
Example #29
Source File: ConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Emits the next finite string of the input automaton as a token.
 *
 * <p>The automaton and its iterator are created on the first call. Every token after the first
 * gets a position increment of 0, and all tokens get the offsets {@code (0, endOffset)}.
 *
 * @return false once the iterator returns null, true otherwise
 * @throws IllegalStateException if the iterator is not yet created and {@code wasReset} is false
 */
@Override
public boolean incrementToken() throws IOException {
  if (finiteStrings == null) {
    if (!wasReset) {
      throw new IllegalStateException("reset() missing before incrementToken");
    }
    // lazy init/consume
    // toAutomaton() calls reset(), incrementToken() repeatedly, and end() on inputTokenStream
    finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(), maxGraphExpansions);
    //note: would be nice to know the startOffset but toAutomaton doesn't capture it.  We'll assume 0
    endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset();
  }

  final IntsRef path = finiteStrings.next();
  if (path == null) {
    return false;
  }

  clearAttributes();

  // if number of iterated strings so far is more than one...
  final boolean stacked = finiteStrings.size() > 1;
  if (stacked) {
    posIncrAtt.setPositionIncrement(0);
  }

  offsetAtt.setOffset(0, endOffset);

  // now we have UTF-8
  Util.toBytesRef(path, bytesAtt.builder());
  if (charTermAttribute != null) {
    charTermAttribute.setLength(0);
    charTermAttribute.append(bytesAtt.toUTF16());
  }

  return true;
}
 
Example #30
Source File: FiniteStringsIteratorTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that {@code actual} holds exactly the strings that
 * {@code AutomatonTestUtil.getFiniteStringsRecursive} reports for the automaton, with no
 * duplicates.
 *
 * @param automaton Automaton.
 * @param actual Strings generated by automaton.
 */
private void assertFiniteStringsRecursive(Automaton automaton, List<IntsRef> actual) {
  final Set<IntsRef> reference = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
  // Same element count as the reference set, so no string is emitted twice.
  final int referenceCount = reference.size();
  assertEquals(referenceCount, actual.size());
  final Set<IntsRef> actualDistinct = new HashSet<>(actual);
  assertEquals(reference, actualDistinct);
}