Java Code Examples for org.apache.lucene.util.fst.FST#BytesReader

The following examples show how to use org.apache.lucene.util.fst.FST#BytesReader. You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: FSTTermsReader.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Traverses the arcs of {@code fst} in first-in-first-out order, starting from
 * its first arc.
 *
 * <p>Each target node is expanded at most once: a bit set keyed by the node
 * address (narrowed to {@code int}) remembers which nodes were already read.
 * The method produces no result; it only reads arcs through the FST.
 *
 * @param fst the FST to traverse
 * @throws IOException if reading an arc from the FST fails
 */
static<T> void walk(FST<T> fst) throws IOException {
  final ArrayList<FST.Arc<T>> pending = new ArrayList<>();
  final BitSet visited = new BitSet();
  final FST.BytesReader in = fst.getBytesReader();
  pending.add(fst.getFirstArc(new FST.Arc<T>()));

  while (pending.size() > 0) {
    // Take the oldest queued arc.
    final FST.Arc<T> current = pending.remove(0);
    final long target = current.target();
    if (!FST.targetHasArcs(current)) {
      continue;
    }
    final int nodeBit = (int) target;
    if (visited.get(nodeBit)) {
      continue;
    }
    visited.set(nodeBit);

    // Enqueue a private copy of every arc leaving the target node; the
    // 'current' instance is reused as the read cursor.
    fst.readFirstRealTargetArc(target, current, in);
    pending.add(new FST.Arc<T>().copyFrom(current));
    while (!current.isLast()) {
      fst.readNextRealArc(current, in);
      pending.add(new FST.Arc<T>().copyFrom(current));
    }
  }
}
 
Example 2
Source File: FreeTextSuggester.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Follows the bytes of {@code scratch} through {@code fst} from its first arc
 * and sums the outputs of the arcs taken.
 *
 * @param fst the FST to search
 * @param bytesReader reader used for arc lookups
 * @param scratch the bytes to follow ({@code offset}/{@code length} delimit the range)
 * @param arc scratch arc; overwritten with the first arc and then with each matched arc
 * @return the accumulated output, or {@code null} if some byte has no matching arc
 * @throws IOException declared by the arc lookup
 */
private Long lookupPrefix(FST<Long> fst, FST.BytesReader bytesReader,
                          BytesRef scratch, Arc<Long> arc) throws /*Bogus*/IOException {
  Long sum = fst.outputs.getNoOutput();
  fst.getFirstArc(arc);

  final byte[] data = scratch.bytes;
  final int limit = scratch.offset + scratch.length;
  for (int i = scratch.offset; i < limit; i++) {
    // Labels are looked up as unsigned byte values.
    final int label = data[i] & 0xff;
    if (fst.findTargetArc(label, arc, arc, bytesReader) == null) {
      return null;
    }
    sum = fst.outputs.add(sum, arc.output());
  }
  return sum;
}
 
Example 3
Source File: FSTCompletion.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Copies every arc leaving the first arc of {@code automaton} into an array,
 * in the reverse of the order in which the arcs are read. The original author
 * notes the reversal is there so that the highest weights come first.
 *
 * @param automaton the FST whose root arcs are collected
 * @return copies of the root arcs, last-read arc first
 * @throws RuntimeException wrapping any {@link IOException} raised while reading arcs
 */
@SuppressWarnings({"unchecked","rawtypes"})
private static Arc<Object>[] cacheRootArcs(FST<Object> automaton) {
  try {
    final List<Arc<Object>> collected = new ArrayList<>();
    final Arc<Object> cursor = automaton.getFirstArc(new Arc<>());
    final FST.BytesReader in = automaton.getBytesReader();

    // 'cursor' is reused for every read, so each arc is stored as a copy.
    automaton.readFirstTargetArc(cursor, cursor, in);
    collected.add(new Arc<Object>().copyFrom(cursor));
    while (!cursor.isLast()) {
      automaton.readNextArc(cursor, in);
      collected.add(new Arc<Object>().copyFrom(cursor));
    }

    // Fill the result back to front, which reverses the read order.
    final int count = collected.size();
    final Arc[] reversed = new Arc[count];
    for (int i = 0; i < count; i++) {
      reversed[count - 1 - i] = collected.get(i);
    }
    return reversed;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
 
Example 4
Source File: UserDictionary.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up words in a slice of text, trying every start position in the slice.
 *
 * <p>From each start position the characters are followed through {@code fst};
 * whenever a final arc is reached, the summed arc outputs plus the arc's final
 * output are appended to the result (the original documentation calls these
 * values word ids).
 *
 * @param chars text
 * @param off offset into text
 * @param len length of text
 * @return list of the values found, in the order they were encountered
 * @throws IOException if an arc lookup fails
 */
public List<Integer> lookup(char[] chars, int off, int len) throws IOException {
  final List<Integer> hits = new ArrayList<>();
  final FST.BytesReader fstReader = fst.getBytesReader();
  final int limit = off + len;

  FST.Arc<Long> cursor = new FST.Arc<>();
  for (int start = off; start < limit; start++) {
    // Restart from the first arc for every start position.
    cursor = fst.getFirstArc(cursor);
    int accumulated = 0;
    for (int pos = start; pos < limit; pos++) {
      final int ch = chars[pos];
      // The cache flag is set only for the first character of each attempt.
      final boolean firstChar = (pos == start);
      if (fst.findTargetArc(ch, cursor, cursor, firstChar, fstReader) == null) {
        break; // no longer a prefix of any entry; move to the next start position
      }
      accumulated += cursor.output().intValue();
      if (cursor.isFinal()) {
        final int value = accumulated + cursor.nextFinalOutput().intValue();
        hits.add(value);
      }
    }
  }
  return hits;
}
 
Example 5
Source File: FSTCompletion.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Scans the cached root arcs, beginning at <code>rootArcIndex</code>, for the
 * first one under which <code>utf8</code> is an exact match.
 *
 * <p>A root arc matches when descending from it along <code>utf8</code>
 * succeeds and the first arc of the node reached carries
 * {@link FST#END_LABEL}.
 *
 * @param rootArcIndex
 *          Index into {@link #rootArcs} of the first root arc to try.
 *
 * @param utf8
 *          The sequence of utf8 bytes to follow.
 *
 * @return The label of the matching root arc (described by the original
 *         author as the bucket number), or <code>-1</code> if no root arc
 *         matches.
 */
private int getExactMatchStartingFromRootArc(
    int rootArcIndex, BytesRef utf8) {
  try {
    final FST.Arc<Object> scratch = new FST.Arc<>();
    final FST.BytesReader in = automaton.getBytesReader();

    for (int idx = rootArcIndex; idx < rootArcs.length; idx++) {
      final FST.Arc<Object> candidate = rootArcs[idx];
      // Work on a copy so the cached root arc itself is never modified.
      final FST.Arc<Object> cursor = scratch.copyFrom(candidate);

      if (!descendWithPrefix(cursor, utf8)) {
        continue;
      }
      automaton.readFirstTargetArc(cursor, cursor, in);
      if (cursor.label() == FST.END_LABEL) {
        return candidate.label();
      }
    }

    // No match.
    return -1;
  } catch (IOException e) {
    // Should never happen, but anyway.
    throw new RuntimeException(e);
  }
}
 
Example 6
Source File: FSTCompletion.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Walks from <code>arc</code> through the automaton, following the bytes of
 * <code>utf8</code> one at a time.
 *
 * @param arc
 *          The starting arc. It is overwritten in place by every lookup.
 * @param utf8
 *          The bytes to descend along.
 * @return <code>true</code> if every byte had a matching arc, in which case
 *         <code>arc</code> holds the arc matched for the last byte of
 *         <code>utf8</code>; <code>false</code> as soon as a byte has no
 *         matching arc.
 * @throws IOException if an arc lookup fails
 */
private boolean descendWithPrefix(Arc<Object> arc, BytesRef utf8)
    throws IOException {
  // A reader is obtained per call rather than kept in a field; the original
  // author notes that multiple threads can use FSTCompletion at once.
  final FST.BytesReader in = automaton.getBytesReader();

  int pos = utf8.offset;
  final int stop = pos + utf8.length;
  while (pos < stop) {
    final int label = utf8.bytes[pos] & 0xff;
    if (automaton.findTargetArc(label, arc, arc, in) == null) {
      // No arc for this byte: the prefix does not exist.
      return false;
    }
    pos++;
  }
  return true;
}
 
Example 7
Source File: TokenInfoFST.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a table of root arcs for the labels {@code 0x3040} through
 * {@code cacheCeiling}.
 *
 * <p>Slot {@code i} holds a copy of the arc found for label
 * {@code 0x3040 + i} when following the first arc of {@code fst}, or stays
 * {@code null} when no such arc exists.
 *
 * @return the populated table
 * @throws IOException if an arc lookup fails
 */
@SuppressWarnings({"rawtypes","unchecked"})
private FST.Arc<Long>[] cacheRootArcs() throws IOException {
  final int base = 0x3040;
  final FST.Arc<Long>[] table = new FST.Arc[1 + (cacheCeiling - base)];

  final FST.Arc<Long> root = new FST.Arc<>();
  fst.getFirstArc(root);
  final FST.Arc<Long> scratch = new FST.Arc<>();
  final FST.BytesReader in = fst.getBytesReader();

  // TODO (from the original): jump to 3040 and readNextRealArc up to the
  // ceiling instead of one lookup per label? (careful not to add bugs)
  for (int slot = 0, label = base; slot < table.length; slot++, label++) {
    if (fst.findTargetArc(label, root, scratch, in) == null) {
      continue;
    }
    // 'scratch' is reused by the next lookup, so store a copy.
    table[slot] = new FST.Arc<Long>().copyFrom(scratch);
  }
  return table;
}
 
Example 8
Source File: TokenInfoFST.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Finds the arc labelled {@code ch}, answering from {@code rootCache} when
 * allowed and otherwise delegating to the wrapped {@code fst}.
 *
 * <p>The cache is used only when {@code useCache} is set and {@code ch} lies
 * in {@code [0x3040, cacheCeiling]}. On that path {@code follow} is not
 * consulted; presumably callers set {@code useCache} only when
 * {@code follow} is the root arc (confirm against callers).
 *
 * @param ch label to look up
 * @param follow arc whose target is searched on the non-cached path
 * @param arc receives the result
 * @param useCache whether the root cache may be used
 * @param fstReader reader used on the non-cached path
 * @return {@code arc} filled with the match, or {@code null} if there is none
 * @throws IOException if the delegated lookup fails
 */
public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache, FST.BytesReader fstReader) throws IOException {
  final boolean servedFromCache = useCache && ch >= 0x3040 && ch <= cacheCeiling;
  if (!servedFromCache) {
    return fst.findTargetArc(ch, follow, arc, fstReader);
  }

  assert ch != FST.END_LABEL;
  final Arc<Long> cached = rootCache[ch - 0x3040];
  if (cached == null) {
    return null;
  }
  // Hand back a copy in the caller's arc so the cached instance stays untouched.
  arc.copyFrom(cached);
  return arc;
}
 
Example 9
Source File: UserDictionary.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up words in a slice of text, trying every start position in the slice.
 *
 * <p>From each start position the characters are followed through {@code fst}.
 * Whenever a final arc is reached, the entry of {@code segmentations} selected
 * by the summed outputs is recorded under the start position relative to
 * {@code off}; a later (longer) match from the same start replaces an earlier
 * one.
 *
 * @param chars text
 * @param off offset into text
 * @param len length of text
 * @return {@code EMPTY_RESULT} when nothing matched, otherwise the recorded
 *         matches converted by {@code toIndexArray} (documented by the original
 *         author as an array of {wordId, position, length})
 * @throws IOException if an arc lookup fails
 */
public int[][] lookup(char[] chars, int off, int len) throws IOException {
  // TODO (from the original): can we avoid this treemap/toIndexArray?
  final TreeMap<Integer, int[]> matches = new TreeMap<>(); // relative start -> segmentation
  final FST.BytesReader fstReader = fst.getBytesReader();
  final int limit = off + len;

  FST.Arc<Long> cursor = new FST.Arc<>();
  for (int start = off; start < limit; start++) {
    // Restart from the first arc for every start position.
    cursor = fst.getFirstArc(cursor);
    int accumulated = 0;
    for (int pos = start; pos < limit; pos++) {
      final int ch = chars[pos];
      // The cache flag is set only for the first character of each attempt.
      final boolean firstChar = (pos == start);
      if (fst.findTargetArc(ch, cursor, cursor, firstChar, fstReader) == null) {
        break; // continue to next position
      }
      accumulated += cursor.output().intValue();
      if (cursor.isFinal()) {
        final int key = accumulated + cursor.nextFinalOutput().intValue();
        matches.put(start - off, segmentations[key]);
      }
    }
  }

  // The map is non-empty exactly when at least one match was recorded.
  if (matches.isEmpty()) {
    return EMPTY_RESULT;
  }
  return toIndexArray(matches);
}
 
Example 10
Source File: TokenInfoFST.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a table of root arcs for the labels {@code 0xAC00} through
 * {@code cacheCeiling}.
 *
 * <p>Slot {@code i} holds a copy of the arc found for label
 * {@code 0xAC00 + i} when following the first arc of {@code fst}, or stays
 * {@code null} when no such arc exists.
 *
 * @return the populated table
 * @throws IOException if an arc lookup fails
 */
@SuppressWarnings({"rawtypes","unchecked"})
private FST.Arc<Long>[] cacheRootArcs() throws IOException {
  final int base = 0xAC00;
  final FST.Arc<Long>[] table = new FST.Arc[1 + (cacheCeiling - base)];

  final FST.Arc<Long> root = new FST.Arc<>();
  fst.getFirstArc(root);
  final FST.Arc<Long> scratch = new FST.Arc<>();
  final FST.BytesReader in = fst.getBytesReader();

  // TODO (from the original): jump to AC00 and readNextRealArc up to the
  // ceiling instead of one lookup per label? (careful not to add bugs)
  for (int slot = 0, label = base; slot < table.length; slot++, label++) {
    if (fst.findTargetArc(label, root, scratch, in) == null) {
      continue;
    }
    // 'scratch' is reused by the next lookup, so store a copy.
    table[slot] = new FST.Arc<Long>().copyFrom(scratch);
  }
  return table;
}
 
Example 11
Source File: TokenInfoFST.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Finds the arc labelled {@code ch}, answering from {@code rootCache} when
 * allowed and otherwise delegating to the wrapped {@code fst}.
 *
 * <p>The cache is used only when {@code useCache} is set and {@code ch} lies
 * in {@code [0xAC00, cacheCeiling]}. On that path {@code follow} is not
 * consulted; presumably callers set {@code useCache} only when
 * {@code follow} is the root arc (confirm against callers).
 *
 * @param ch label to look up
 * @param follow arc whose target is searched on the non-cached path
 * @param arc receives the result
 * @param useCache whether the root cache may be used
 * @param fstReader reader used on the non-cached path
 * @return {@code arc} filled with the match, or {@code null} if there is none
 * @throws IOException if the delegated lookup fails
 */
public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache, FST.BytesReader fstReader) throws IOException {
  final boolean servedFromCache = useCache && ch >= 0xAC00 && ch <= cacheCeiling;
  if (!servedFromCache) {
    return fst.findTargetArc(ch, follow, arc, fstReader);
  }

  assert ch != FST.END_LABEL;
  final Arc<Long> cached = rootCache[ch - 0xAC00];
  if (cached == null) {
    return null;
  }
  // Hand back a copy in the caller's arc so the cached instance stays untouched.
  arc.copyFrom(cached);
  return arc;
}
 
Example 12
Source File: Dictionary.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up a word in {@code fst}, one code point at a time, followed by
 * {@link FST#END_LABEL}, and returns the combined output of the arcs taken.
 *
 * @param fst the FST to search; may be {@code null}
 * @param word buffer holding the word
 * @param offset index of the first char of the word
 * @param length number of chars in the word
 * @return the accumulated output, or {@code null} if {@code fst} is
 *         {@code null}, a code point has no matching arc, or there is no
 *         {@code END_LABEL} arc after the last code point
 * @throws RuntimeException wrapping any {@link IOException} from the arc lookups
 */
IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) {
  if (fst == null) {
    return null;
  }
  final FST.BytesReader in = fst.getBytesReader();
  final FST.Arc<IntsRef> cursor = fst.getFirstArc(new FST.Arc<IntsRef>());
  final IntsRef noOutput = fst.outputs.getNoOutput();
  final int limit = offset + length;

  // Output collected along the path so far.
  IntsRef collected = noOutput;
  try {
    int pos = offset;
    while (pos < limit) {
      final int codePoint = Character.codePointAt(word, pos, limit);
      if (fst.findTargetArc(codePoint, cursor, cursor, in) == null) {
        return null;
      }
      final IntsRef step = cursor.output();
      // Identity comparison against the shared "no output" instance.
      if (step != noOutput) {
        collected = fst.outputs.add(collected, step);
      }
      pos += Character.charCount(codePoint);
    }

    // The word must also end here: require an END_LABEL arc.
    if (fst.findTargetArc(FST.END_LABEL, cursor, cursor, in) == null) {
      return null;
    }
    final IntsRef last = cursor.output();
    if (last != noOutput) {
      return fst.outputs.add(collected, last);
    }
    return collected;
  } catch (IOException bogus) {
    throw new RuntimeException(bogus);
  }
}
 
Example 13
Source File: Dictionary.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Rewrites {@code sb} in place using the mappings stored in {@code fst}.
 *
 * <p>At each position the longest run of characters that ends on a final arc
 * is replaced by the output collected for it. Scanning then resumes right
 * after the inserted replacement, so replaced text is not matched again.
 *
 * @param fst the FST holding the mappings
 * @param sb the buffer to rewrite; modified in place
 * @throws IOException if an arc lookup fails
 */
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
  final FST.BytesReader in = fst.getBytesReader();
  final FST.Arc<CharsRef> root = fst.getFirstArc(new FST.Arc<CharsRef>());
  final CharsRef noOutput = fst.outputs.getNoOutput();

  // Reused as the lookup cursor for every start position.
  final FST.Arc<CharsRef> cursor = new FST.Arc<>();

  int start = 0;
  while (start < sb.length()) {
    cursor.copyFrom(root);
    CharsRef running = noOutput;
    int matchEnd = -1;
    CharsRef replacement = null;

    for (int pos = start; pos < sb.length(); pos++) {
      final char ch = sb.charAt(pos);
      if (fst.findTargetArc(ch, cursor, cursor, in) == null) {
        break;
      }
      running = fst.outputs.add(running, cursor.output());
      if (cursor.isFinal()) {
        // Remember the latest (hence longest) match seen from 'start'.
        replacement = fst.outputs.add(running, cursor.nextFinalOutput());
        matchEnd = pos;
      }
    }

    if (matchEnd >= 0) {
      sb.delete(start, matchEnd + 1);
      sb.insert(start, replacement);
      // Land on the last inserted char; the increment below steps past it.
      start += replacement.length - 1;
    }
    start++;
  }
}
 
Example 14
Source File: TokenInfoFST.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the bytes reader obtained from the wrapped {@code fst}.
 *
 * @return the result of {@code fst.getBytesReader()}
 */
public FST.BytesReader getBytesReader() {
  final FST.BytesReader reader = fst.getBytesReader();
  return reader;
}
 
Example 15
Source File: TokenInfoFST.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the bytes reader obtained from the wrapped {@code fst}.
 *
 * @return the result of {@code fst.getBytesReader()}
 */
public FST.BytesReader getBytesReader() {
  final FST.BytesReader reader = fst.getBytesReader();
  return reader;
}
 
Example 16
Source File: Sequences.java    From querqy with Apache License 2.0 3 votes vote down vote up
/**
 * Processes a term: passes it to {@code appendToSequences}, then follows the
 * term's code points through {@code fst} from the first arc.
 *
 * <p>If every code point has a matching arc, a new {@code Sequence} (a copy
 * of the arc reached, the term as a single-element list, and the output
 * collected so far) is added to {@code addSequences}. If that arc is also
 * final, the collected output combined with the arc's final output is passed
 * to {@code addOutput}.
 *
 * @param term the term to process
 * @throws IOException if an arc lookup fails
 */
public void putTerm(Term term) throws IOException {

      appendToSequences(term);

      final FST.Arc<BytesRef> cursor = new FST.Arc<>();
      fst.getFirstArc(cursor);
      BytesRef collected = fst.outputs.getNoOutput();
      final FST.BytesReader in = fst.getBytesReader();
      final CharSequence value = term.getValue();

      final int len = value.length();
      int pos = 0;
      boolean matched = true;
      while (matched && pos < len) {
         final int codePoint = Character.codePointAt(value, pos);
         matched = fst.findTargetArc(codePoint, cursor, cursor, in) != null;
         // NOTE(review): the output is folded in even on the iteration where the
         // lookup failed; the value is not used afterwards in that case.
         collected = fst.outputs.add(collected, cursor.output);
         pos += Character.charCount(codePoint);
      }

      if (!matched) {
         return;
      }

      final List<Term> terms = Collections.singletonList(term);
      // Store a copy of the arc; 'cursor' is still read below.
      final FST.Arc<BytesRef> reached = new FST.Arc<>();
      addSequences.add(new Sequence(reached.copyFrom(cursor), terms, collected));

      if (cursor.isFinal()) {
         addOutput(fst.outputs.add(collected, cursor.nextFinalOutput));
      }

   }