Java Code Examples for org.apache.lucene.util.fst.FST#Arc

The following examples show how to use org.apache.lucene.util.fst.FST#Arc. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: FstDecompounder.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Consume a maximal glue morpheme, if any, and consume the next word.
 *
 * <p>Walks the {@code glueMorphemes} FST one code point at a time starting at {@code offset}.
 * Every time a final arc is reached, the matched span is pushed onto {@code chunks} as a
 * {@code ChunkType.GLUE_MORPHEME}, {@code matchWord} is invoked on the remaining input (if
 * there is any), and the chunk is popped again before the walk continues with the next
 * code point.
 *
 * @param utf32           input code points; the valid slice is
 *                        {@code ints[utf32.offset, utf32.offset + utf32.length)}
 * @param offset          absolute index into {@code utf32.ints} at which matching starts
 * @param builder         passed through unchanged to {@code matchWord}
 * @param maxPathsBuilder passed through unchanged to {@code matchWord}
 * @param chunks          working stack of chunks matched so far; restored to its incoming
 *                        state before this method returns normally
 * @throws IOException if reading the FST fails
 */
private void matchGlueMorpheme(IntsRef utf32, final int offset, StringBuilder builder,
                               IntsRefBuilder maxPathsBuilder,
                               Deque<Chunk> chunks) throws IOException {
    FST.Arc<Object> arc = glueMorphemes.getFirstArc(new FST.Arc<>());
    BytesReader br = glueMorphemes.getBytesReader();
    // Exclusive end of the valid slice. The loop and the "anything left?" check below must
    // agree on it; bounding the loop by utf32.length alone is wrong whenever utf32.offset != 0.
    final int end = utf32.offset + utf32.length;
    for (int i = offset; i < end; i++) {
        int chr = utf32.ints[i];
        arc = glueMorphemes.findTargetArc(chr, arc, arc, br);
        if (arc == null) {
            // No glue morpheme continues with this code point.
            break;
        }
        if (arc.isFinal()) {
            chunks.addLast(new Chunk(offset, i + 1, ChunkType.GLUE_MORPHEME));
            if (i + 1 < end) {
                matchWord(utf32, i + 1, builder, maxPathsBuilder, chunks);
            }
            chunks.removeLast();
        }
    }
}
 
Example 2
Source File: OrdsSegmentTermsEnum.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the header stored in {@code frameData} and pushes the frame it describes.
 *
 * <p>The leading vLong of {@code frameData.bytes} carries the block file pointer in its
 * upper bits (above {@code OUTPUT_FLAGS_NUM_BITS}) together with the has-terms and
 * is-floor flags.
 *
 * @param arc       FST arc to associate with the pushed frame
 * @param frameData FST output holding the encoded header and the starting ordinal
 * @param length    forwarded to the file-pointer overload of {@code pushFrame}
 * @return the frame at ordinal {@code 1 + currentFrame.ord}
 * @throws IOException if decoding or pushing the frame fails
 */
OrdsSegmentTermsEnumFrame pushFrame(FST.Arc<Output> arc, Output frameData, int length) throws IOException {
  scratchReader.reset(frameData.bytes.bytes, frameData.bytes.offset, frameData.bytes.length);
  final long header = scratchReader.readVLong();
  final long blockFp = header >>> OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;

  final OrdsSegmentTermsEnumFrame frame = getFrame(1 + currentFrame.ord);
  frame.hasTerms = (header & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
  frame.hasTermsOrig = frame.hasTerms;
  frame.isFloor = (header & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;

  // Floor data must be installed before the delegate call below, in case that call
  // tries to rewind the frame.
  if (frame.isFloor) {
    frame.termOrdOrig = frameData.startOrd;
    frame.setFloorData(scratchReader, frameData.bytes);
  }

  pushFrame(arc, blockFp, length, frameData.startOrd);
  return frame;
}
 
Example 3
Source File: UserDictionary.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up dictionary words in a slice of text.
 *
 * <p>For every start position in {@code [off, off + len)} the FST is walked from its first
 * arc for as long as the following characters match; each time a final arc is reached the
 * accumulated output (plus the arc's final output) is appended to the result.
 *
 * @param chars text
 * @param off offset into text
 * @param len length of text
 * @return list of wordId, in the order the matches were found
 * @throws IOException if reading the FST fails
 */
public List<Integer> lookup(char[] chars, int off, int len) throws IOException {
  final List<Integer> wordIds = new ArrayList<>();
  final FST.BytesReader reader = fst.getBytesReader();
  final int limit = off + len;

  FST.Arc<Long> arc = new FST.Arc<>();
  for (int start = off; start < limit; start++) {
    // Restart from the first arc for every candidate start position.
    arc = fst.getFirstArc(arc);
    int accumulated = 0;
    for (int pos = start; pos < limit; pos++) {
      final int ch = chars[pos];
      // The boolean flag is true only for the first character of each attempt.
      final boolean firstChar = pos == start;
      if (fst.findTargetArc(ch, arc, arc, firstChar, reader) == null) {
        break; // no match from here; move on to the next start position
      }
      accumulated += arc.output().intValue();
      if (arc.isFinal()) {
        final int wordId = accumulated + arc.nextFinalOutput().intValue();
        wordIds.add(wordId);
      }
    }
  }
  return wordIds;
}
 
Example 4
Source File: NGramSynonymTokenizer.java    From elasticsearch-analysis-synonym with Apache License 2.0 6 votes vote down vote up
/**
 * Resets the tokenizer state and, when the synonym loader reports a change since
 * {@code lastModified}, swaps in the freshly loaded synonym map and its FST.
 *
 * @throws IOException if the superclass reset fails
 * @throws IllegalArgumentException if a reloaded synonym map has a {@code null} FST
 */
@Override
public void reset() throws IOException {
    super.reset();

    // Rewind the per-stream scanning state.
    block.setLength(0);
    prevToken = null;
    readBufferIndex = BUFFER_SIZE;
    readBufferLen = 0;
    ch = 0;
    blkStart = 0;
    nextBlkStart = 0;

    final boolean reloadNeeded = synonymLoader != null && synonymLoader.isUpdate(lastModified);
    if (!reloadNeeded) {
        return;
    }

    lastModified = synonymLoader.getLastModified();
    final SynonymMap reloaded = synonymLoader.getSynonymMap();
    if (reloaded == null) {
        // Nothing to swap in; keep using the current map.
        return;
    }

    synonymMap = reloaded;
    fst = synonymMap.fst;
    if (fst == null) {
        throw new IllegalArgumentException("fst must be non-null");
    }
    fstReader = fst.getBytesReader();
    scratchArc = new FST.Arc<>();
    clearAttributes();
}
 
Example 5
Source File: IDVersionSegmentTermsEnum.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the header stored in {@code frameData.output1} and pushes the frame it describes.
 *
 * <p>The leading vLong carries the block file pointer in its upper bits (above
 * {@code OUTPUT_FLAGS_NUM_BITS}) together with the has-terms and is-floor flags. The
 * frame's {@code maxIDVersion} is set to {@code Long.MAX_VALUE - frameData.output2}.
 *
 * @param arc       FST arc to associate with the pushed frame
 * @param frameData pair of (encoded header bytes, version component)
 * @param length    forwarded to the file-pointer overload of {@code pushFrame}
 * @return the frame at ordinal {@code 1 + currentFrame.ord}
 * @throws IOException if decoding or pushing the frame fails
 */
IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, Pair<BytesRef,Long> frameData, int length) throws IOException {
  final BytesRef encoded = frameData.output1;
  scratchReader.reset(encoded.bytes, encoded.offset, encoded.length);
  final long header = scratchReader.readVLong();
  final long blockFp = header >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;

  final IDVersionSegmentTermsEnumFrame frame = getFrame(1 + currentFrame.ord);
  frame.maxIDVersion = Long.MAX_VALUE - frameData.output2;
  frame.hasTerms = (header & VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
  frame.hasTermsOrig = frame.hasTerms;
  frame.isFloor = (header & VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
  if (frame.isFloor) {
    // Remaining bytes after the header are handed to the frame as floor data.
    frame.setFloorData(scratchReader, encoded);
  }

  pushFrame(arc, blockFp, length);
  return frame;
}
 
Example 6
Source File: FSTTermsReader.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Loads {@code frame} with the ceiling arc for {@code label} below {@code top}, i.e. an arc
 * with {@code arc.label >= label} that the automaton does not reject.
 *
 * @param label smallest acceptable arc label
 * @param top   parent frame whose arc and automaton state are the starting point
 * @param frame frame to fill; its {@code fstArc} is used as the destination arc
 * @return {@code frame} when an accepted arc was found, the result of
 *         {@code loadNextFrame(top, frame)} when the automaton rejects the ceiling arc,
 *         or {@code null} when there is no ceiling arc
 * @throws IOException if reading the FST fails
 */
Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException {
  final FST.Arc<FSTTermOutputs.TermData> ceil =
      Util.readCeilArc(label, fst, top.fstArc, frame.fstArc, fstReader);
  if (ceil == null) {
    return null;
  }

  frame.fsaState = fsa.step(top.fsaState, ceil.label());
  if (frame.fsaState == -1) {
    // Rejected by the automaton: fall back to the next sibling arc.
    return loadNextFrame(top, frame);
  }

  frame.output = frame.fstArc.output();
  return frame;
}
 
Example 7
Source File: BlockTreeTermsReader.java    From incubator-retired-blur with Apache License 2.0 5 votes vote down vote up
/**
 * Pushes the frame for the block at file pointer {@code fp}.
 *
 * <p>If the frame at the next ordinal already points at {@code fp} and has entries loaded
 * ({@code nextEnt != -1}), it is reused and only rewound when its prefix is longer than
 * {@code targetBeforeCurrentLength}. Otherwise the frame is reset to describe a block
 * that has not been loaded yet.
 *
 * @param arc    FST arc to store on the frame
 * @param fp     file pointer of the block
 * @param length prefix length for the frame
 * @return the frame at ordinal {@code 1 + currentFrame.ord}
 * @throws IOException if rewinding the frame fails
 */
Frame pushFrame(FST.Arc<BytesRef> arc, long fp, int length) throws IOException {
  final Frame frame = getFrame(1 + currentFrame.ord);
  frame.arc = arc;

  final boolean reusable = frame.fpOrig == fp && frame.nextEnt != -1;
  if (!reusable) {
    // Different (or never loaded) block: reset the frame to an unloaded state.
    frame.nextEnt = -1;
    frame.prefix = length;
    frame.state.termBlockOrd = 0;
    frame.fp = fp;
    frame.fpOrig = fp;
    frame.lastSubFP = -1;
    return frame;
  }

  // Same block as last time: rewind only when the cached prefix extends past the
  // part of the target that is already matched.
  if (frame.prefix > targetBeforeCurrentLength) {
    frame.rewind();
  }
  assert length == frame.prefix;
  return frame;
}
 
Example 8
Source File: BlockTreeTermsReader.java    From incubator-retired-blur with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the reusable arc at {@code ord}, growing the {@code arcs} pool when the ordinal
 * lies beyond its current length. Newly added slots are filled with fresh arcs.
 */
private FST.Arc<BytesRef> getArc(int ord) {
  if (ord < arcs.length) {
    return arcs[ord];
  }

  final int oldLength = arcs.length;
  @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<BytesRef>[] grown =
    new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
  System.arraycopy(arcs, 0, grown, 0, oldLength);
  int slot = oldLength;
  while (slot < grown.length) {
    grown[slot] = new FST.Arc<BytesRef>();
    slot++;
  }
  arcs = grown;
  return arcs[ord];
}
 
Example 9
Source File: SegmentTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes the header stored in {@code frameData} and pushes the frame it describes.
 *
 * <p>The leading vLong carries the block file pointer in its upper bits (above
 * {@code OUTPUT_FLAGS_NUM_BITS}) together with the has-terms and is-floor flags.
 *
 * @param arc       FST arc to associate with the pushed frame
 * @param frameData encoded header bytes taken from the FST output
 * @param length    forwarded to the file-pointer overload of {@code pushFrame}
 * @return the frame at ordinal {@code 1 + currentFrame.ord}
 * @throws IOException if decoding or pushing the frame fails
 */
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length) throws IOException {
  scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
  final long header = scratchReader.readVLong();
  final long blockFp = header >>> BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;

  final SegmentTermsEnumFrame frame = getFrame(1 + currentFrame.ord);
  frame.hasTerms = (header & BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
  frame.hasTermsOrig = frame.hasTerms;
  frame.isFloor = (header & BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0;
  if (frame.isFloor) {
    // The reader is positioned just past the header when it is handed over.
    frame.setFloorData(scratchReader, frameData);
  }

  pushFrame(arc, blockFp, length);
  return frame;
}
 
Example 10
Source File: BlockTreeTermsReader.java    From incubator-retired-blur with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the vLong header at the start of {@code frameData}, configures the next frame's
 * flags from it, and delegates to the file-pointer overload of {@code pushFrame}.
 *
 * @param arc       FST arc to associate with the pushed frame
 * @param frameData encoded header bytes taken from the FST output
 * @param length    forwarded to the file-pointer overload
 * @return the frame at ordinal {@code 1 + currentFrame.ord}
 * @throws IOException if decoding or pushing the frame fails
 */
Frame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length) throws IOException {
  scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
  final long packed = scratchReader.readVLong();
  // File pointer sits above the flag bits.
  final long filePointer = packed >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;

  final Frame next = getFrame(1 + currentFrame.ord);
  next.hasTerms = (packed & BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
  next.hasTermsOrig = next.hasTerms;
  next.isFloor = (packed & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
  if (next.isFloor) {
    next.setFloorData(scratchReader, frameData);
  }

  pushFrame(arc, filePointer, length);
  return next;
}
 
Example 11
Source File: TokenInfoFST.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Finds the arc labelled {@code ch} leaving {@code follow}, writing it into {@code arc}.
 *
 * <p>When {@code useCache} is set and {@code ch} lies in {@code [0x3040, cacheCeiling]},
 * the answer is served from {@code rootCache}; otherwise the lookup is delegated to the
 * underlying FST.
 *
 * @param ch        label to look for
 * @param follow    arc whose target node is searched (unused on a cache hit)
 * @param arc       destination arc that receives the result
 * @param useCache  whether the root cache may be consulted
 * @param fstReader reader used for the delegated lookup
 * @return {@code arc} filled with the target, or {@code null} if there is no such arc
 * @throws IOException if reading the FST fails
 */
public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache, FST.BytesReader fstReader) throws IOException {
  final boolean cacheable = useCache && ch >= 0x3040 && ch <= cacheCeiling;
  if (!cacheable) {
    return fst.findTargetArc(ch, follow, arc, fstReader);
  }

  assert ch != FST.END_LABEL;
  final Arc<Long> cached = rootCache[ch - 0x3040];
  if (cached == null) {
    return null;
  }
  // Copy so callers never mutate the shared cache entry.
  arc.copyFrom(cached);
  return arc;
}
 
Example 12
Source File: TokenInfoFST.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the root-arc cache: one slot per label in {@code [0x3040, cacheCeiling]}, holding a
 * private copy of the arc for that label leaving the FST's first arc, or {@code null}
 * when no such arc exists.
 *
 * @return the populated cache array, indexed by {@code label - 0x3040}
 * @throws IOException if reading the FST fails
 */
@SuppressWarnings({"rawtypes","unchecked"})
private FST.Arc<Long>[] cacheRootArcs() throws IOException {
  final FST.Arc<Long>[] cache = new FST.Arc[1+(cacheCeiling-0x3040)];
  final FST.Arc<Long> root = new FST.Arc<>();
  fst.getFirstArc(root);
  final FST.Arc<Long> scratch = new FST.Arc<>();
  final FST.BytesReader reader = fst.getBytesReader();
  // TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
  for (int slot = 0; slot < cache.length; slot++) {
    final int label = 0x3040 + slot;
    final boolean found = fst.findTargetArc(label, root, scratch, reader) != null;
    if (found) {
      // scratch is reused on every iteration, so store a copy.
      cache[slot] = new FST.Arc<Long>().copyFrom(scratch);
    }
  }
  return cache;
}
 
Example 13
Source File: IDVersionSegmentTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Pushes the frame for the block at file pointer {@code fp}.
 *
 * <p>A frame that already points at {@code fp} and has entries loaded
 * ({@code nextEnt != -1}) is reused, and rewound only when its prefix exceeds
 * {@code targetBeforeCurrentLength}. Any other frame is reset to an unloaded state for
 * the new block.
 *
 * @param arc    FST arc to store on the frame
 * @param fp     file pointer of the block
 * @param length prefix length for the frame
 * @return the frame at ordinal {@code 1 + currentFrame.ord}
 * @throws IOException if rewinding the frame fails
 */
IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, long fp, int length) throws IOException {
  final IDVersionSegmentTermsEnumFrame frame = getFrame(1 + currentFrame.ord);
  frame.arc = arc;

  final boolean sameLoadedBlock = frame.fpOrig == fp && frame.nextEnt != -1;
  if (sameLoadedBlock) {
    if (frame.prefix > targetBeforeCurrentLength) {
      frame.rewind();
    }
    assert length == frame.prefix;
    return frame;
  }

  // New block for this frame: clear everything that described the previous one.
  frame.nextEnt = -1;
  frame.prefix = length;
  frame.state.termBlockOrd = 0;
  frame.fp = fp;
  frame.fpOrig = fp;
  frame.lastSubFP = -1;
  return frame;
}
 
Example 14
Source File: SegmentTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the pooled arc for {@code ord}. When {@code ord} is past the end of
 * {@code arcs}, the pool is replaced by a larger array that keeps the existing arcs and
 * has every additional slot pre-populated with a new arc.
 */
private FST.Arc<BytesRef> getArc(int ord) {
  final int have = arcs.length;
  if (ord >= have) {
    @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<BytesRef>[] bigger =
    new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
    System.arraycopy(arcs, 0, bigger, 0, have);
    for (int fresh = have; fresh < bigger.length; fresh++) {
      bigger[fresh] = new FST.Arc<>();
    }
    arcs = bigger;
  }
  return arcs[ord];
}
 
Example 15
Source File: SynonymFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * @param input input tokenstream
 * @param synonyms synonym map
 * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
 *                   Note, if you set this to true, it's your responsibility to lowercase
 *                   the input entries when you create the {@link SynonymMap}
 */
public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
  super(input);
  this.synonyms = synonyms;
  this.ignoreCase = ignoreCase;
  this.fst = synonyms.fst;
  if (fst == null) {
    throw new IllegalArgumentException("fst must be non-null");
  }
  this.fstReader = fst.getBytesReader();

  // Must be 1+ so that when roll buffer is at full
  // lookahead we can distinguish this full buffer from
  // the empty buffer:
  rollBufferSize = 1+synonyms.maxHorizontalContext;

  futureInputs = new PendingInput[rollBufferSize];
  futureOutputs = new PendingOutputs[rollBufferSize];
  for(int pos=0;pos<rollBufferSize;pos++) {
    futureInputs[pos] = new PendingInput();
    futureOutputs[pos] = new PendingOutputs();
  }

  //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

  scratchArc = new FST.Arc<>();
}
 
Example 16
Source File: DynamicSynonymGraphFilter.java    From elasticsearch-analysis-dynamic-synonym with Apache License 2.0 5 votes vote down vote up
/**
 * Swaps in a new synonym map at runtime.
 *
 * <p>Every field assigned here was originally {@code final} and was made mutable only to
 * support this update path; assigning those fields anywhere else may introduce bugs.
 *
 * <p>The map is validated before any field is touched, so a rejected map leaves the
 * filter's current synonyms, FST, reader and scratch arc unchanged.
 *
 * @param synonymMap the replacement synonym map; its {@code fst} must be non-null
 * @throws IllegalArgumentException if {@code synonymMap.fst} is {@code null}
 */
@Override
public void update(SynonymMap synonymMap) {
    if (synonymMap.fst == null) {
        throw new IllegalArgumentException("fst must be non-null");
    }
    this.synonyms = synonymMap;
    this.fst = synonymMap.fst;
    this.fstReader = this.fst.getBytesReader();
    // Diamond instead of the raw FST.Arc type avoids an unchecked assignment.
    this.scratchArc = new FST.Arc<>();
}
 
Example 17
Source File: OrdsSegmentTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the reusable arc stored at {@code ord}. If the {@code arcs} array is too short,
 * it is first replaced by an oversized copy whose new tail slots each hold a fresh arc.
 */
private FST.Arc<Output> getArc(int ord) {
  if (ord < arcs.length) {
    return arcs[ord];
  }

  @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<Output>[] expanded =
  new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
  System.arraycopy(arcs, 0, expanded, 0, arcs.length);
  int index = arcs.length;
  while (index < expanded.length) {
    expanded[index] = new FST.Arc<>();
    index++;
  }
  arcs = expanded;
  return arcs[ord];
}
 
Example 18
Source File: FSTCompletion.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Lookup suggestions sorted by weight (descending order).
 *
 * <p>Root arcs are visited in array order. For each one the automaton is descended using
 * {@code key} as a prefix and, on success, completions are collected from that subgraph.
 *
 * @param key
 *          The prefix to complete.
 * @param num
 *          Number of suggestions requested; passed to {@code collect} and used as the
 *          cap when an exact match is inserted at the front.
 * @param collectAll
 *          If <code>false</code>, the routine terminates as soon as {@code collect}
 *          reports that enough suggestions have been gathered (after optionally
 *          promoting an exact match). If <code>true</code>, it keeps collecting
 *          suggestions from all weight arcs.
 * @return the collected completions
 * @throws IOException if reading the automaton fails
 */
private ArrayList<Completion> lookupSortedByWeight(BytesRef key, 
    int num, boolean collectAll) throws IOException {
  // Cap the initial capacity: callers may pass Integer.MAX_VALUE as num to request
  // every match, and that must not translate into a huge allocation.
  final ArrayList<Completion> res = new ArrayList<>(Math.min(10, num));

  final BytesRef output = BytesRef.deepCopyOf(key);
  for (int bucket = 0; bucket < rootArcs.length; bucket++) {
    final FST.Arc<Object> rootArc = rootArcs[bucket];
    final FST.Arc<Object> arc = new FST.Arc<>().copyFrom(rootArc);

    // Descend into the automaton using the key as prefix.
    if (!descendWithPrefix(arc, key)) {
      continue;
    }

    // The subgraph under this arc holds the completions of the key prefix. The arc
    // itself is the key's last byte and is collected too, hence the shortened length.
    output.length = key.length - 1;
    final boolean enough = collect(res, num, rootArc.label(), output, arc);
    if (collectAll || !enough) {
      continue;
    }

    // Enough suggestions to return immediately; first look for an exact match if
    // that was requested.
    if (exactFirst && !checkExistingAndReorder(res, key)) {
      final int exactMatchBucket = getExactMatchStartingFromRootArc(bucket, key);
      if (exactMatchBucket != -1) {
        // Insert as the first result and truncate at num.
        while (res.size() >= num) {
          res.remove(res.size() - 1);
        }
        res.add(0, new Completion(key, exactMatchBucket));
      }
    }
    break;
  }
  return res;
}
 
Example 19
Source File: FSTTermsReader.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/** Creates a frame holding a fresh FST arc, with {@code fsaState} initialised to {@code -1}. */
Frame() {
  this.fsaState = -1;
  this.fstArc = new FST.Arc<>();
}
 
Example 20
Source File: TokenInfoFST.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Delegates to the wrapped FST's {@code getFirstArc}.
 *
 * @param arc arc instance handed to the FST
 * @return whatever the wrapped FST returns for {@code arc}
 */
public Arc<Long> getFirstArc(FST.Arc<Long> arc) {
  final Arc<Long> first = fst.getFirstArc(arc);
  return first;
}