org.apache.lucene.util.fst.FST.BytesReader Java Examples

The following examples show how to use org.apache.lucene.util.fst.FST.BytesReader. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: FreeTextSuggester.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Walks the FST along the bytes of {@code scratch} and sums the outputs of the
 * arcs that were followed.
 *
 * <p>Only the path is checked; the method does not test whether the last arc
 * is final, so the result is the output accumulated for the prefix.
 *
 * @param fst the FST to traverse
 * @param bytesReader reader handed to every {@code findTargetArc} call
 * @param scratch the bytes to follow, read from {@code offset} for {@code length} bytes
 * @param arc reusable arc; it is reset to the first arc and then advanced in place
 * @return the accumulated output, or {@code null} if some byte has no matching arc
 */
private Long lookupPrefix(FST<Long> fst, FST.BytesReader bytesReader,
                          BytesRef scratch, Arc<Long> arc) throws /*Bogus*/IOException {

  Long accumulated = fst.outputs.getNoOutput();

  // Start the walk from the FST's first arc.
  fst.getFirstArc(arc);

  final byte[] data = scratch.bytes;
  final int limit = scratch.offset + scratch.length;
  for (int i = scratch.offset; i < limit; i++) {
    // Bytes are signed in Java; mask to get the unsigned label.
    final int label = data[i] & 0xff;
    if (fst.findTargetArc(label, arc, arc, bytesReader) == null) {
      return null;
    }
    accumulated = fst.outputs.add(accumulated, arc.output());
  }

  return accumulated;
}
 
Example #2
Source File: StemmerOverrideFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up a key, given as a slice of a char buffer, in the FST dictionary.
 *
 * <p>The key is consumed one Unicode code point at a time; when {@code ignoreCase}
 * is set each code point is lower-cased before it is matched.
 *
 * @param buffer the characters holding the key
 * @param bufferLen the number of chars of {@code buffer} that make up the key
 * @param scratchArc reusable arc; reset to the first arc and advanced in place
 * @param fstReader reader handed to every {@code findTargetArc} call
 * @return the value mapped to the key, or <code>null</code> if the key is not in the FST dictionary
 */
public BytesRef get(char[] buffer, int bufferLen, Arc<BytesRef> scratchArc, BytesReader fstReader) throws IOException {
  BytesRef accumulated = fst.outputs.getNoOutput();
  fst.getFirstArc(scratchArc);

  for (int pos = 0; pos < bufferLen; ) {
    final int codePoint = Character.codePointAt(buffer, pos, bufferLen);

    final int label;
    if (ignoreCase) {
      label = Character.toLowerCase(codePoint);
    } else {
      label = codePoint;
    }

    if (fst.findTargetArc(label, scratchArc, scratchArc, fstReader) == null) {
      return null;
    }
    accumulated = fst.outputs.add(accumulated, scratchArc.output());

    // Advance by one or two chars depending on whether this was a surrogate pair.
    pos += Character.charCount(codePoint);
  }

  // The whole key was consumed, but it only matches if the path ends on a final arc.
  if (!scratchArc.isFinal()) {
    return null;
  }
  return fst.outputs.add(accumulated, scratchArc.nextFinalOutput());
}
 
Example #3
Source File: Util.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** Returns the output the FST maps {@code input} to, or
 *  null when the FST does not accept {@code input}. */
public static<T> T get(FST<T> fst, IntsRef input) throws IOException {

  // TODO: would be nice not to alloc this on every lookup
  final FST.Arc<T> current = fst.getFirstArc(new FST.Arc<T>());

  final BytesReader reader = fst.getBytesReader();

  // Running sum of the outputs along the path followed so far
  T total = fst.outputs.getNoOutput();
  final int end = input.offset + input.length;
  for (int pos = input.offset; pos < end; pos++) {
    if (fst.findTargetArc(input.ints[pos], current, current, reader) == null) {
      return null;
    }
    total = fst.outputs.add(total, current.output());
  }

  // Reaching the end of the input is only a match when the last arc is final
  return current.isFinal() ? fst.outputs.add(total, current.nextFinalOutput()) : null;
}
 
Example #4
Source File: FstDecompounder.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Consume a maximal glue morpheme, if any, and consume the next word.
 *
 * <p>Starting at {@code offset}, follows the glue-morpheme FST one code point at a
 * time. Each time a final arc is reached, a {@code GLUE_MORPHEME} chunk covering
 * {@code [offset, i + 1)} is pushed, {@code matchWord} is invoked on the remainder
 * (if any input is left), and the chunk is popped again so that longer glue
 * morphemes can still be tried.
 *
 * @param utf32 the input code points; the valid range is
 *              {@code [utf32.offset, utf32.offset + utf32.length)}
 * @param offset absolute index into {@code utf32.ints} at which matching starts
 * @param builder passed through to {@code matchWord}
 * @param maxPathsBuilder passed through to {@code matchWord}
 * @param chunks stack of chunks for the path currently being explored
 * @throws IOException if reading the FST fails
 */
private void matchGlueMorpheme(IntsRef utf32, final int offset, StringBuilder builder,
                               IntsRefBuilder maxPathsBuilder,
                               Deque<Chunk> chunks) throws IOException {
    FST.Arc<Object> arc = glueMorphemes.getFirstArc(new FST.Arc<>());
    BytesReader br = glueMorphemes.getBytesReader();
    // Absolute end of the valid slice. The loop bound must include utf32.offset,
    // just like the "is there input left" check below; using utf32.length alone
    // cuts the scan short whenever the slice does not start at index 0.
    final int end = utf32.offset + utf32.length;
    for (int i = offset; i < end; i++) {
        int chr = utf32.ints[i];
        arc = glueMorphemes.findTargetArc(chr, arc, arc, br);
        if (arc == null) {
            // No glue morpheme continues with this code point.
            break;
        }
        if (arc.isFinal()) {
            chunks.addLast(new Chunk(offset, i + 1, ChunkType.GLUE_MORPHEME));
            if (i + 1 < end) {
                matchWord(utf32, i + 1, builder, maxPathsBuilder, chunks);
            }
            // Undo the tentative chunk before trying a longer glue morpheme.
            chunks.removeLast();
        }
    }
}
 
Example #5
Source File: StemmerOverrideFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the {@link BytesReader} that callers hand to
 * {@link #get(char[], int, FST.Arc, FST.BytesReader)}.
 *
 * @return a reader obtained from the underlying FST, or {@code null} when there is no FST
 */
public BytesReader getBytesReader() {
  return fst == null ? null : fst.getBytesReader();
}
 
Example #6
Source File: Util.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Reverse lookup: finds the input whose output equals
 *  {@code targetOutput}, for the special case where the
 *  FST's outputs are strictly ascending.  Returns null if
 *  no input/output pair has that output.
 *
 *  <p>NOTE: this only works with {@code FST<Long>}, and
 *  only when the outputs ascend in the same order as the
 *  inputs.  Simple ordinals (0, 1, 2, ...) or file
 *  offsets (when appending to a file) satisfy this.
 *
 *  <p>This convenience overload allocates the reader,
 *  the arcs and the result builder, then delegates to
 *  the six-argument {@code getByOutput}. */
@Deprecated
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {

  final BytesReader reader = fst.getBytesReader();

  // TODO: would be nice not to alloc this on every lookup
  final FST.Arc<Long> startArc = fst.getFirstArc(new FST.Arc<Long>());

  final FST.Arc<Long> spareArc = new FST.Arc<>();

  final IntsRefBuilder pathBuilder = new IntsRefBuilder();

  return getByOutput(fst, targetOutput, reader, startArc, spareArc, pathBuilder);
}
 
Example #7
Source File: Util.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the first arc greater or equal than the given label into the provided
 * arc in place and returns it iff found, otherwise return <code>null</code>.
 *
 * <p>Three node layouts are handled in turn: fixed-length arcs with direct
 * addressing, fixed-length arcs searched with binary search, and
 * variable-length arcs scanned linearly.
 *
 * @param label the label to ceil on
 * @param fst the fst to operate on
 * @param follow the arc to follow reading the label from
 * @param arc the arc to read into in place
 * @param in the fst's {@link BytesReader}
 * @return {@code arc}, positioned on the first arc whose label is {@code >= label},
 *         or <code>null</code> if the target node has no such arc
 * @throws IOException if reading from {@code in} fails
 */
public static <T> Arc<T> readCeilArc(int label, FST<T> fst, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
  // The end label is handled entirely by FST.readEndArc.
  if (label == FST.END_LABEL) {
    return FST.readEndArc(follow, arc);
  }
  // Nothing to search when the followed arc's target has no outgoing arcs.
  if (!FST.targetHasArcs(follow)) {
    return null;
  }
  // Load the first arc of the target node; its fields tell us how the node is encoded.
  fst.readFirstTargetArc(follow, arc, in);
  if (arc.bytesPerArc() != 0 && arc.label() != FST.END_LABEL) {
    if (arc.nodeFlags() == FST.ARCS_FOR_DIRECT_ADDRESSING) {
      // Fixed length arcs in a direct addressing node.
      // Position of the requested label relative to the label of the arc just read.
      int targetIndex = label - arc.label();
      if (targetIndex >= arc.numArcs()) {
        // The requested label lies at or past the end of the addressed range: no ceiling.
        return null;
      } else if (targetIndex < 0) {
        // The requested label is smaller than the first arc's label,
        // so the first arc (already loaded) is the ceiling.
        return arc;
      } else {
        if (BitTable.isBitSet(targetIndex, arc, in)) {
          // An arc exists for exactly this label.
          fst.readArcByDirectAddressing(arc, in, targetIndex);
          assert arc.label() == label;
        } else {
          // No arc for this label; move to the next index whose bit is set.
          int ceilIndex = BitTable.nextBitSet(targetIndex, arc, in);
          assert ceilIndex != -1;
          fst.readArcByDirectAddressing(arc, in, ceilIndex);
          assert arc.label() > label;
        }
        return arc;
      }
    }
    // Fixed length arcs in a binary search node.
    int idx = binarySearch(fst, arc, label);
    if (idx >= 0) {
      // Exact match.
      return fst.readArcByIndex(arc, in, idx);
    }
    // NOTE(review): presumably a negative result encodes the insertion point
    // as (-1 - insertionPoint), as java.util.Arrays.binarySearch does; confirm
    // against binarySearch's contract.
    idx = -1 - idx;
    if (idx == arc.numArcs()) {
      // DEAD END!
      return null;
    }
    return fst.readArcByIndex(arc, in , idx);
  }

  // Variable length arcs in a linear scan list,
  // or special arc with label == FST.END_LABEL.
  fst.readFirstRealTargetArc(follow.target(), arc, in);

  // Scan the arcs in order until one with a large enough label is found
  // or the last arc of the node has been examined.
  while (true) {
    // System.out.println("  non-bs cycle");
    // TODO: we should fix this code to not have to create
    // object for the output of every arc we scan... only
    // for the matching arc, if found
    if (arc.label() >= label) {
      // System.out.println("    found!");
      return arc;
    } else if (arc.isLast()) {
      return null;
    } else {
      fst.readNextRealArc(arc, in);
    }
  }
}
 
Example #8
Source File: TestFSTs.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Builds an FST, looks up the arc for {@code 'm'}, illegally mutates the output
 * object returned by that arc, and then repeats the lookup. An
 * {@link AssertionError} from the second lookup is tolerated.
 */
public void testIllegallyModifyRootArc() throws Exception {
  assumeTrue("test relies on assertions", assertsAreEnabled);

  final Set<BytesRef> uniqueTerms = new HashSet<>();
  for (int i = 0; i < 100; i++) {
    final String prefix = String.valueOf((char) ('a' + i));
    uniqueTerms.add(new BytesRef(prefix));
    if (prefix.equals("m")) {
      // "m" is added only as a bare term, without any suffixed variants.
      continue;
    }
    for (int j = 0; j < 20; j++) {
      // Make a big enough FST that the root cache will be created:
      final String suffix = TestUtil.randomRealisticUnicodeString(random(), 10, 20);
      uniqueTerms.add(new BytesRef(prefix + suffix));
    }
  }

  // Terms are sorted before being fed to the compiler.
  final List<BytesRef> sortedTerms = new ArrayList<>(uniqueTerms);
  Collections.sort(sortedTerms);

  final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
  final FSTCompiler<BytesRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

  // Each term is stored with itself as its output.
  final IntsRefBuilder scratchInput = new IntsRefBuilder();
  for (BytesRef term : sortedTerms) {
    Util.toIntsRef(term, scratchInput);
    fstCompiler.add(scratchInput.get(), term);
  }

  final FST<BytesRef> fst = fstCompiler.compile();

  Arc<BytesRef> arc = new FST.Arc<>();
  fst.getFirstArc(arc);
  final FST.BytesReader reader = fst.getBytesReader();
  arc = fst.findTargetArc((int) 'm', arc, arc, reader);
  assertNotNull(arc);
  assertEquals(new BytesRef("m"), arc.output());

  // NOTE: illegal:
  arc.output().length = 0;

  // Look the same arc up again after the illegal mutation.
  fst.getFirstArc(arc);
  try {
    arc = fst.findTargetArc((int) 'm', arc, arc, reader);
  } catch (AssertionError expected) {
    // expected
  }
}