org.apache.lucene.util.fst.FST Java Examples

The following examples show how to use org.apache.lucene.util.fst.FST. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: NormalizeCharMap.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Compiles every pair registered through {@link #add} into an FST-backed
 * {@link NormalizeCharMap}. Call this once all mappings have been added; the
 * pending pairs are cleared after the FST has been compiled.
 */
public NormalizeCharMap build() {
  final FST<CharsRef> compiled;
  try {
    final FSTCompiler<CharsRef> compiler =
        new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, CharSequenceOutputs.getSingleton());
    final IntsRefBuilder keyScratch = new IntsRefBuilder();
    for (Map.Entry<String,String> pair : pendingPairs.entrySet()) {
      // Key is fed as UTF-16 code units; the value becomes the arc output.
      compiler.add(Util.toUTF16(pair.getKey(), keyScratch), new CharsRef(pair.getValue()));
    }
    compiled = compiler.compile();
    pendingPairs.clear();
  } catch (IOException e) {
    // The FST API declares IOException, but it is not expected to occur here.
    throw new RuntimeException(e);
  }
  return new NormalizeCharMap(compiled);
}
 
Example #2
Source File: DynamicSynonymFilter.java    From elasticsearch-analysis-dynamic-synonym with Apache License 2.0 6 votes vote down vote up
/**
 * Swaps in a new synonym map and rebuilds all state derived from it.
 *
 * <p>Every field assigned here was changed from {@code final} to support this
 * update logic; assign them only from this method, otherwise bugs may result.
 *
 * <p>The new map is validated before any field is touched, so a rejected map
 * leaves the filter's previous state intact.
 *
 * @param synonymMap the replacement synonym map; its {@code fst} must be non-null
 * @throws IllegalArgumentException if {@code synonymMap.fst} is null
 */
@Override
public void update(SynonymMap synonymMap) {
    // Validate first: throwing after assigning would leave a null fst behind.
    if (synonymMap.fst == null) {
        throw new IllegalArgumentException("fst must be non-null");
    }

    this.synonyms = synonymMap;
    this.fst = synonymMap.fst;
    this.fstReader = this.fst.getBytesReader();

    // One slot more than the map's maximum horizontal context.
    this.rollBufferSize = 1 + synonymMap.maxHorizontalContext;
    this.futureInputs = new DynamicSynonymFilter.PendingInput[this.rollBufferSize];
    this.futureOutputs = new DynamicSynonymFilter.PendingOutputs[this.rollBufferSize];
    for (int pos = 0; pos < this.rollBufferSize; ++pos) {
        this.futureInputs[pos] = new DynamicSynonymFilter.PendingInput();
        this.futureOutputs[pos] = new DynamicSynonymFilter.PendingOutputs();
    }

    this.scratchArc = new FST.Arc<>();
}
 
Example #3
Source File: UserDictionary.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up dictionary words in a slice of text, trying every start position.
 *
 * <p>For each start offset the FST is walked character by character; each time
 * a final arc is reached, the accumulated output plus the arc's final output
 * is recorded.
 *
 * @param chars text
 * @param off offset into text
 * @param len length of text
 * @return list of word ids found, in the order they were matched
 * @throws IOException if reading the FST fails
 */
public List<Integer> lookup(char[] chars, int off, int len) throws IOException {
  final List<Integer> wordIds = new ArrayList<>();
  final FST.BytesReader reader = fst.getBytesReader();

  FST.Arc<Long> arc = new FST.Arc<>();
  final int limit = off + len;
  for (int start = off; start < limit; start++) {
    arc = fst.getFirstArc(arc);
    int accumulated = 0;
    for (int pos = start; pos < limit; pos++) {
      final int ch = chars[pos];
      final boolean firstChar = pos == start;
      if (fst.findTargetArc(ch, arc, arc, firstChar, reader) == null) {
        break; // no continuation from here; move on to the next start position
      }
      accumulated += arc.output().intValue();
      if (arc.isFinal()) {
        wordIds.add(accumulated + arc.nextFinalOutput().intValue());
      }
    }
  }
  return wordIds;
}
 
Example #4
Source File: FSTTermsReader.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Traverses the FST breadth-first from its first arc, expanding each target
 * node at most once.
 *
 * <p>The method produces no result; it only reads every reachable arc.
 *
 * @param fst the FST to traverse
 * @throws IOException if reading arcs from the FST fails
 */
static<T> void walk(FST<T> fst) throws IOException {
  // FIFO work list. ArrayDeque polls the head in O(1); the former
  // ArrayList.remove(0) shifted every remaining element on each poll.
  final java.util.ArrayDeque<FST.Arc<T>> queue = new java.util.ArrayDeque<>();
  final BitSet seen = new BitSet();
  final FST.BytesReader reader = fst.getBytesReader();
  queue.add(fst.getFirstArc(new FST.Arc<T>()));
  while (!queue.isEmpty()) {
    final FST.Arc<T> arc = queue.poll();
    final long node = arc.target();
    // NOTE(review): node is narrowed to int for the BitSet index; this assumes
    // node addresses fit in a non-negative int - confirm for large FSTs.
    if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
      seen.set((int) node);
      fst.readFirstRealTargetArc(node, arc, reader);
      while (true) {
        // arc is reused as the read cursor, so enqueue a copy.
        queue.add(new FST.Arc<T>().copyFrom(arc));
        if (arc.isLast()) {
          break;
        }
        fst.readNextRealArc(arc, reader);
      }
    }
  }
}
 
Example #5
Source File: IDVersionSegmentTermsEnum.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the frame header stored in {@code frameData} and pushes the frame
 * that follows the current one.
 *
 * <p>The first VLong in {@code frameData.output1} carries the block file
 * pointer in its upper bits and the has-terms / is-floor flags in its low bits.
 *
 * @param arc the FST arc passed through to the file-pointer overload
 * @param frameData encoded header bytes ({@code output1}) and version ({@code output2})
 * @param length prefix length passed through to the file-pointer overload
 * @return the frame at ordinal {@code 1 + currentFrame.ord}
 * @throws IOException if decoding or pushing the frame fails
 */
IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, Pair<BytesRef,Long> frameData, int length) throws IOException {
  final BytesRef encoded = frameData.output1;
  scratchReader.reset(encoded.bytes, encoded.offset, encoded.length);
  final long header = scratchReader.readVLong();
  final long blockFP = header >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;

  final IDVersionSegmentTermsEnumFrame frame = getFrame(1+currentFrame.ord);
  frame.maxIDVersion = Long.MAX_VALUE - frameData.output2;

  final boolean withTerms = (header & VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
  frame.hasTerms = withTerms;
  frame.hasTermsOrig = withTerms;

  final boolean floor = (header & VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
  frame.isFloor = floor;
  if (floor) {
    // Floor data is installed before the push below; keep this order.
    frame.setFloorData(scratchReader, encoded);
  }

  pushFrame(arc, blockFP, length);
  return frame;
}
 
Example #6
Source File: TrieBuilder.java    From ambiverse-nlu with Apache License 2.0 6 votes vote down vote up
/**
 * Builds an FST that maps each string to a sequential numeric output.
 *
 * <p>Strings are added in iteration order. A mention whose insertion raises
 * an {@link AssertionError} is skipped (best effort) and logged at debug level
 * together with the error, so the reason is not lost.
 *
 * @param sortedStrings the mentions to add; presumably must iterate in the
 *        order the FST builder expects - verify against callers
 * @return the finished FST
 * @throws IOException if the builder fails while adding or finishing
 */
public static FST<Long> buildTrie(Set<String> sortedStrings) throws IOException {
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  BytesRefBuilder scratchBytes = new BytesRefBuilder();
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  long outputValue = 0;
  for (String mention : sortedStrings) {
    scratchBytes.copyChars(mention);
    try {
      // outputValue++ is evaluated before add() runs, so a skipped mention
      // still consumes its number and the assigned outputs may have gaps.
      builder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), outputValue++);
    } catch (java.lang.AssertionError ae) {
      // Deliberately tolerant: skip this mention but keep the cause in the log.
      logger.debug("Assertion error for mention " + mention, ae);
    }
  }
  return builder.finish();
}
 
Example #7
Source File: OrdsSegmentTermsEnum.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the frame header stored in {@code frameData} and pushes the frame
 * that follows the current one.
 *
 * <p>The first VLong in {@code frameData.bytes} carries the block file pointer
 * in its upper bits and the has-terms / is-floor flags in its low bits.
 *
 * @param arc the FST arc passed through to the file-pointer overload
 * @param frameData encoded header bytes plus the starting term ordinal
 * @param length prefix length passed through to the file-pointer overload
 * @return the frame at ordinal {@code 1 + currentFrame.ord}
 * @throws IOException if decoding or pushing the frame fails
 */
OrdsSegmentTermsEnumFrame pushFrame(FST.Arc<Output> arc, Output frameData, int length) throws IOException {
  scratchReader.reset(frameData.bytes.bytes, frameData.bytes.offset, frameData.bytes.length);
  final long header = scratchReader.readVLong();
  final long blockFP = header >>> OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;

  final OrdsSegmentTermsEnumFrame frame = getFrame(1+currentFrame.ord);

  final boolean withTerms = (header & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
  frame.hasTerms = withTerms;
  frame.hasTermsOrig = withTerms;

  final boolean floor = (header & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
  frame.isFloor = floor;

  // Floor data must be set before pushFrame, in case pushFrame tries to rewind.
  if (floor) {
    frame.termOrdOrig = frameData.startOrd;
    frame.setFloorData(scratchReader, frameData.bytes);
  }

  pushFrame(arc, blockFP, length, frameData.startOrd);
  return frame;
}
 
Example #8
Source File: FuzzySuggester.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Computes the prefix paths for a fuzzy lookup by intersecting the FST with a
 * Levenshtein expansion of the lookup automaton.
 *
 * <p>The supplied {@code prefixPaths} are not consulted; the paths are
 * recomputed from {@code lookupAutomaton} and {@code fst}.
 *
 * @throws IOException if intersecting with the FST fails
 */
@Override
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                     Automaton lookupAutomaton,
                                                                     FST<Pair<Long,BytesRef>> fst)
  throws IOException {

  // TODO: fuzzy matches currently carry no penalty: an exact prefix match gets
  // no boost over a completion needing one edit, which in turn gets no boost
  // over one needing two. A multiplicative factor is probably appropriate
  // (e.g. a fuzzy match must have at least 2X the weight of the non-fuzzy
  // match to "compete"), in which case the wFST likely needs log weights.

  final Automaton fuzzyAutomaton = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
  return FSTUtil.intersectPrefixPaths(fuzzyAutomaton, fst);
}
 
Example #9
Source File: FstDecompounder.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Consume a maximal glue morpheme, if any, and consume the next word.
 *
 * <p>Walks the glue-morpheme FST over {@code utf32} starting at {@code offset}.
 * Each time a final arc is reached, a {@code GLUE_MORPHEME} chunk is pushed,
 * word matching continues after it (if input remains), and the chunk is popped
 * again.
 *
 * @param utf32 the input code points; indices are absolute positions in
 *        {@code utf32.ints}
 * @param offset absolute index in {@code utf32.ints} at which to start matching
 * @param builder passed through to {@code matchWord}
 * @param maxPathsBuilder passed through to {@code matchWord}
 * @param chunks the stack of chunks matched so far
 * @throws IOException if reading the FST fails
 */
private void matchGlueMorpheme(IntsRef utf32, final int offset, StringBuilder builder,
                               IntsRefBuilder maxPathsBuilder,
                               Deque<Chunk> chunks) throws IOException {
    FST.Arc<Object> arc = glueMorphemes.getFirstArc(new FST.Arc<>());
    BytesReader br = glueMorphemes.getBytesReader();
    // Absolute end of the slice. The loop indexes utf32.ints directly, so the
    // bound has to include utf32.offset (previously only the inner check did).
    final int end = utf32.offset + utf32.length;
    for (int i = offset; i < end; i++) {
        int chr = utf32.ints[i];
        arc = glueMorphemes.findTargetArc(chr, arc, arc, br);
        if (arc == null) {
            break;
        }
        if (arc.isFinal()) {
            chunks.addLast(new Chunk(offset, i + 1, ChunkType.GLUE_MORPHEME));
            if (i + 1 < end) {
                matchWord(utf32, i + 1, builder, maxPathsBuilder, chunks);
            }
            chunks.removeLast();
        }
    }
}
 
Example #10
Source File: NormalizeCharMap.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Wraps the compiled FST and, when it is non-null, caches a copy of every arc
 * leaving the root, keyed by the arc's label as a {@code char}.
 *
 * @param map the compiled FST, or {@code null} for an empty map
 */
private NormalizeCharMap(FST<CharsRef> map) {
  this.map = map;
  if (map == null) {
    return;
  }
  try {
    // Pre-cache root arcs:
    final FST.Arc<CharsRef> cursor = new FST.Arc<>();
    final FST.BytesReader reader = map.getBytesReader();
    map.getFirstArc(cursor);
    if (FST.targetHasArcs(cursor)) {
      map.readFirstRealTargetArc(cursor.target(), cursor, reader);
      boolean hasMore = true;
      while (hasMore) {
        assert cursor.label() != FST.END_LABEL;
        // cursor is reused while iterating, so store a copy.
        cachedRootArcs.put(Character.valueOf((char) cursor.label()), new FST.Arc<CharsRef>().copyFrom(cursor));
        hasMore = !cursor.isLast();
        if (hasMore) {
          map.readNextRealArc(cursor, reader);
        }
      }
    }
  } catch (IOException e) {
    // The FST API declares IOException, but it is not expected to occur here.
    throw new RuntimeException(e);
  }
}
 
Example #11
Source File: StemmerOverrideFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Compiles the collected entries into a {@link StemmerOverrideMap} for use
 * with the {@link StemmerOverrideFilter}.
 *
 * <p>Keys are visited in the order returned by {@code hash.sort()}; each key's
 * UTF-8 bytes are copied into the scratch ints builder and added to the FST
 * with the output value stored under the same id.
 *
 * @return a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
 * @throws IOException if an {@link IOException} occurs
 */
public StemmerOverrideMap build() throws IOException {
  final FSTCompiler<BytesRef> compiler =
      new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, ByteSequenceOutputs.getSingleton());
  final int[] sortedIds = hash.sort();
  final IntsRefBuilder inputScratch = new IntsRefBuilder();
  final int count = hash.size();
  final BytesRef keyScratch = new BytesRef();
  for (int rank = 0; rank < count; rank++) {
    final int id = sortedIds[rank];
    final BytesRef key = hash.get(id, keyScratch);
    inputScratch.copyUTF8Bytes(key);
    compiler.add(inputScratch.get(), new BytesRef(outputValues.get(id)));
  }
  return new StemmerOverrideMap(compiler.compile(), ignoreCase);
}
 
Example #12
Source File: FSTCompletion.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Collects a copy of every arc leaving the automaton's root and returns them
 * in reverse read order, so completions with the highest weights come first.
 *
 * @param automaton the FST whose root arcs are cached
 * @return the root arcs, reversed relative to the order they were read
 */
@SuppressWarnings({"unchecked","rawtypes"})
private static Arc<Object>[] cacheRootArcs(FST<Object> automaton) {
  try {
    final List<Arc<Object>> collected = new ArrayList<>();
    final Arc<Object> cursor = automaton.getFirstArc(new Arc<>());
    final FST.BytesReader reader = automaton.getBytesReader();
    automaton.readFirstTargetArc(cursor, cursor, reader);

    boolean hasMore = true;
    while (hasMore) {
      // cursor is reused while iterating, so store a copy.
      collected.add(new Arc<>().copyFrom(cursor));
      hasMore = !cursor.isLast();
      if (hasMore) {
        automaton.readNextArc(cursor, reader);
      }
    }

    Collections.reverse(collected); // we want highest weights first.
    return collected.toArray(new Arc[collected.size()]);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
 
Example #13
Source File: IntersectTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the scratch arc at {@code ord}, growing the {@code arcs} array and
 * filling the new slots with fresh arcs when {@code ord} is past its end.
 *
 * @param ord index of the arc to fetch
 * @return the arc stored at {@code ord}
 */
private FST.Arc<BytesRef> getArc(int ord) {
  if (ord < arcs.length) {
    return arcs[ord];
  }

  final int oldLength = arcs.length;
  @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<BytesRef>[] grown =
  new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
  System.arraycopy(arcs, 0, grown, 0, oldLength);
  // Existing arcs are carried over; only the newly added tail needs instances.
  for (int slot = oldLength; slot < grown.length; slot++) {
    grown[slot] = new FST.Arc<>();
  }
  arcs = grown;
  return arcs[ord];
}
 
Example #14
Source File: DynamicSynonymGraphFilter.java    From elasticsearch-analysis-dynamic-synonym with Apache License 2.0 5 votes vote down vote up
/**
 * Swaps in a new synonym map and rebuilds the state derived from it.
 *
 * <p>Every field assigned here was changed from {@code final} to support this
 * update logic; assign them only from this method, otherwise bugs may result.
 *
 * <p>The new map is validated before any field is touched, so a rejected map
 * leaves the filter's previous state intact.
 *
 * @param synonymMap the replacement synonym map; its {@code fst} must be non-null
 * @throws IllegalArgumentException if {@code synonymMap.fst} is null
 */
@Override
public void update(SynonymMap synonymMap) {
    // Validate first: throwing after assigning would leave a null fst behind.
    if (synonymMap.fst == null) {
        throw new IllegalArgumentException("fst must be non-null");
    }

    this.synonyms = synonymMap;
    this.fst = synonymMap.fst;
    this.fstReader = this.fst.getBytesReader();
    this.scratchArc = new FST.Arc<>();
}
 
Example #15
Source File: VersionBlockTreeTermsWriter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Adds every input/output pair of {@code subIndex} to {@code fstCompiler},
 * in the order the sub-index enumerates them.
 *
 * @param fstCompiler the compiler receiving the entries
 * @param subIndex the FST whose entries are copied
 * @param scratchIntsRef reusable buffer for converting each input to ints
 * @throws IOException if enumerating the sub-index or adding an entry fails
 */
private void append(FSTCompiler<Pair<BytesRef,Long>> fstCompiler, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<Pair<BytesRef,Long>> entries = new BytesRefFSTEnum<>(subIndex);
  for (BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> entry = entries.next();
       entry != null;
       entry = entries.next()) {
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), entry.output);
  }
}
 
Example #16
Source File: IDVersionSegmentTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Pushes the frame following the current one for the block at {@code fp}.
 *
 * <p>If that frame already refers to {@code fp} and has entries loaded
 * ({@code nextEnt != -1}) it is reused, and rewound only when its prefix is
 * longer than {@code targetBeforeCurrentLength}. Otherwise the frame is
 * re-initialised for the new block.
 *
 * @param arc the FST arc to associate with the frame
 * @param fp file pointer of the block
 * @param length prefix length of the frame
 * @return the frame at ordinal {@code 1 + currentFrame.ord}
 * @throws IOException if rewinding the frame fails
 */
IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, long fp, int length) throws IOException {
  final IDVersionSegmentTermsEnumFrame frame = getFrame(1+currentFrame.ord);
  frame.arc = arc;

  final boolean reusable = frame.fpOrig == fp && frame.nextEnt != -1;
  if (!reusable) {
    // Fresh frame: reset its bookkeeping for the new block.
    frame.nextEnt = -1;
    frame.prefix = length;
    frame.state.termBlockOrd = 0;
    frame.fpOrig = frame.fp = fp;
    frame.lastSubFP = -1;
    return frame;
  }

  // Reused frame: rewind only when required, otherwise keep its position.
  if (frame.prefix > targetBeforeCurrentLength) {
    frame.rewind();
  }
  assert length == frame.prefix;
  return frame;
}
 
Example #17
Source File: IDVersionSegmentTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the scratch arc at {@code ord}, growing the {@code arcs} array and
 * filling the new slots with fresh arcs when {@code ord} is past its end.
 *
 * @param ord index of the arc to fetch
 * @return the arc stored at {@code ord}
 */
private FST.Arc<Pair<BytesRef,Long>> getArc(int ord) {
  if (ord < arcs.length) {
    return arcs[ord];
  }

  final int oldLength = arcs.length;
  @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<Pair<BytesRef,Long>>[] grown =
  new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
  System.arraycopy(arcs, 0, grown, 0, oldLength);
  // Existing arcs are carried over; only the newly added tail needs instances.
  for (int slot = oldLength; slot < grown.length; slot++) {
    grown[slot] = new FST.Arc<>();
  }
  arcs = grown;
  return arcs[ord];
}
 
Example #18
Source File: SynonymFilter.java    From elasticsearch-analysis-synonym with Apache License 2.0 5 votes vote down vote up
/**
 * Resets all per-stream state and, if the synonym loader reports an update
 * since {@code lastModified}, swaps in the reloaded synonym map.
 *
 * <p>A reloaded map is validated before any field is replaced, so a map with a
 * null FST is rejected without disturbing the currently active synonyms.
 *
 * @throws IOException if the superclass reset fails
 * @throws IllegalArgumentException if a reloaded map has a null {@code fst}
 */
@Override
public void reset() throws IOException {
  super.reset();
  captureCount = 0;
  finished = false;
  inputSkipCount = 0;
  nextRead = nextWrite = 0;

  // In normal usage these resets would not be needed,
  // since they reset-as-they-are-consumed, but the app
  // may not consume all input tokens (or we might hit an
  // exception), in which case we have leftover state
  // here:
  for (final PendingInput input : futureInputs) {
    input.reset();
  }
  for (final PendingOutputs output : futureOutputs) {
    output.reset();
  }

  if (synonymLoader != null && synonymLoader.isUpdate(lastModified)) {
      lastModified = synonymLoader.getLastModified();
      final SynonymMap map = synonymLoader.getSynonymMap();
      if (map != null) {
          // Validate first: throwing after assigning would leave a null fst
          // (and a stale fstReader) behind.
          if (map.fst == null) {
              throw new IllegalArgumentException("fst must be non-null");
          }
          synonyms = map;
          fst = map.fst;
          fstReader = fst.getBytesReader();
          scratchArc = new FST.Arc<>();
          // NOTE(review): the roll buffers (futureInputs/futureOutputs) are not
          // resized here; confirm a reloaded map cannot have a larger
          // maxHorizontalContext than the one the buffers were sized for.
          clearAttributes();
      }
  }
}
 
Example #19
Source File: FSTCompletion.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the first exact match by traversing root arcs, starting from the
 * arc <code>rootArcIndex</code>.
 *
 * <p>For each candidate root arc, the automaton is descended along
 * <code>utf8</code>; the match is exact when the first arc after the prefix
 * carries {@link FST#END_LABEL}.
 *
 * @param rootArcIndex
 *          The first root arc index in {@link #rootArcs} to consider when
 *          matching.
 *
 * @param utf8
 *          The sequence of utf8 bytes to follow.
 *
 * @return Returns the bucket number of the match (the label of the matching
 *         root arc) or <code>-1</code> if no match was found.
 */
private int getExactMatchStartingFromRootArc(
    int rootArcIndex, BytesRef utf8) {
  try {
    final FST.Arc<Object> cursor = new FST.Arc<>();
    final FST.BytesReader reader = automaton.getBytesReader();
    for (int idx = rootArcIndex; idx < rootArcs.length; idx++) {
      final FST.Arc<Object> root = rootArcs[idx];
      // Work on a copy so the cached root arc is left untouched.
      final FST.Arc<Object> arc = cursor.copyFrom(root);

      // Descend into the automaton using the key as prefix.
      if (!descendWithPrefix(arc, utf8)) {
        continue;
      }
      automaton.readFirstTargetArc(arc, arc, reader);
      if (arc.label() == FST.END_LABEL) {
        // Normalize prefix-encoded weight.
        return root.label();
      }
    }
  } catch (IOException e) {
    // Should never happen, but anyway.
    throw new RuntimeException(e);
  }

  // No match.
  return -1;
}
 
Example #20
Source File: AnalyzingSuggester.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns all prefix paths to initialize the search.
 *
 * <p>This implementation returns {@code prefixPaths} unchanged and does not
 * use {@code lookupAutomaton} or {@code fst}; the method is {@code protected},
 * so subclasses can supply a different set of paths.
 *
 * @param prefixPaths the prefix paths already computed for the lookup
 * @param lookupAutomaton the automaton for the lookup key (unused here)
 * @param fst the suggester FST (unused here)
 * @return {@code prefixPaths}, as given
 * @throws IOException declared for overriding implementations; never thrown here
 */
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                     Automaton lookupAutomaton,
                                                                     FST<Pair<Long,BytesRef>> fst)
  throws IOException {
  return prefixPaths;
}
 
Example #21
Source File: FSTTermsWriter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a writer for one field: points the postings writer at the field and
 * sets up the term outputs and the FST compiler that uses them.
 *
 * @param fieldInfo the field whose terms will be written
 */
TermsWriter(FieldInfo fieldInfo) {
  this.fieldInfo = fieldInfo;
  this.numTerms = 0;

  postingsWriter.setField(fieldInfo);

  // The compiler is built on the outputs, so the outputs are created first.
  this.outputs = new FSTTermOutputs(fieldInfo);
  this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
}
 
Example #22
Source File: DatawaveArithmetic.java    From datawave with Apache License 2.0 5 votes vote down vote up
/**
 * Tests whether the string form of {@code object} is a key of {@code fst}.
 *
 * <p>The string is converted to UTF-16 ints and looked up with
 * {@code Util.get}; the lookup runs while holding the monitor of {@code fst}.
 *
 * @param object the value to test; its {@code toString()} is the lookup key
 * @param fst the FST to search
 * @return {@code true} if the lookup returns a non-null output
 * @throws IOException if the FST lookup fails
 */
public static boolean matchesFst(Object object, FST fst) throws IOException {
    final IntsRefBuilder scratch = new IntsRefBuilder();
    Util.toUTF16(object.toString(), scratch);
    final IntsRef key = scratch.get();

    final boolean found;
    synchronized (fst) {
        found = Util.get(fst, key) != null;
    }
    return found;
}
 
Example #23
Source File: TokenInfoFST.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a lookup table of root arcs for the labels {@code 0xAC00} through
 * {@code cacheCeiling} inclusive.
 *
 * <p>Slot {@code i} holds a copy of the root arc for label {@code 0xAC00 + i},
 * or {@code null} when the root has no such arc.
 *
 * @return the table of cached root arcs
 * @throws IOException if reading the FST fails
 */
@SuppressWarnings({"rawtypes","unchecked"})
private FST.Arc<Long>[] cacheRootArcs() throws IOException {
  // NOTE(review): 0xAC00 is the first code point of the Unicode Hangul
  // Syllables block; presumably labels are characters - confirm.
  final int firstLabel = 0xAC00;
  final FST.Arc<Long>[] cache = new FST.Arc[1+(cacheCeiling-firstLabel)];
  final FST.Arc<Long> root = new FST.Arc<>();
  fst.getFirstArc(root);
  final FST.Arc<Long> scratch = new FST.Arc<>();
  final FST.BytesReader reader = fst.getBytesReader();
  // TODO: jump to AC00, readNextRealArc to ceiling? (just be careful we don't add bugs)
  for (int slot = 0; slot < cache.length; slot++) {
    if (fst.findTargetArc(firstLabel + slot, root, scratch, reader) == null) {
      continue;
    }
    // scratch is overwritten on every lookup, so store a copy.
    cache[slot] = new FST.Arc<Long>().copyFrom(scratch);
  }
  return cache;
}
 
Example #24
Source File: Dictionary.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Compiles the affix table into an FST that maps each affix string (as UTF-32
 * code points) to the sequence of integers listed for it.
 *
 * @param affixes affix strings with their associated integer lists, iterated
 *        in the map's order
 * @return the compiled FST
 * @throws IOException if adding an entry or compiling fails
 */
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
  final FSTCompiler<IntsRef> compiler =
      new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
  final IntsRefBuilder keyScratch = new IntsRefBuilder();
  for (Map.Entry<String,List<Integer>> affix : affixes.entrySet()) {
    Util.toUTF32(affix.getKey(), keyScratch);
    final List<Integer> ids = affix.getValue();
    // Pack the boxed ids into a primitive int sequence for the FST output.
    final IntsRef packed = new IntsRef(ids.size());
    for (Integer id : ids) {
      packed.ints[packed.length] = id;
      packed.length++;
    }
    compiler.add(keyScratch.get(), packed);
  }
  return compiler.compile();
}
 
Example #25
Source File: SegmentTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Pushes the frame following the current one for the block at {@code fp}.
 *
 * <p>If that frame already refers to {@code fp} and has entries loaded
 * ({@code nextEnt != -1}) it is reused, and rewound only when its ordinal is
 * greater than {@code targetBeforeCurrentLength}. Otherwise the frame is
 * re-initialised for the new block.
 *
 * @param arc the FST arc to associate with the frame
 * @param fp file pointer of the block
 * @param length prefix length of the frame
 * @return the frame at ordinal {@code 1 + currentFrame.ord}
 * @throws IOException if rewinding the frame fails
 */
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, long fp, int length) throws IOException {
  final SegmentTermsEnumFrame frame = getFrame(1+currentFrame.ord);
  frame.arc = arc;

  final boolean reusable = frame.fpOrig == fp && frame.nextEnt != -1;
  if (!reusable) {
    // Fresh frame: reset its bookkeeping for the new block.
    frame.nextEnt = -1;
    frame.prefix = length;
    frame.state.termBlockOrd = 0;
    frame.fpOrig = frame.fp = fp;
    frame.lastSubFP = -1;
    return frame;
  }

  // Reused frame: the rewind test compares the frame ordinal (not its prefix).
  if (frame.ord > targetBeforeCurrentLength) {
    frame.rewind();
  }
  assert length == frame.prefix;
  return frame;
}
 
Example #26
Source File: VariableGapTermsIndexWriter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the index writer for one field, records where its index starts in
 * {@code out}, and seeds the FST with the empty string.
 *
 * @param fieldInfo the field being indexed
 * @param termsFilePointer terms-file pointer stored as the output of the
 *        empty-string entry and remembered as the field's start pointer
 * @throws IOException if adding the initial entry to the FST fails
 */
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
  this.fieldInfo = fieldInfo;
  startTermsFilePointer = termsFilePointer;

  fstOutputs = PositiveIntOutputs.getSingleton();
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
  indexStart = out.getFilePointer();

  // Always put empty string in
  fstCompiler.add(new IntsRef(), termsFilePointer);
}
 
Example #27
Source File: SynonymFilter.java    From elasticsearch-analysis-synonym with Apache License 2.0 5 votes vote down vote up
/**
 * @param input input tokenstream
 * @param synonymLoader synonym loader
 * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
 *                   Note, if you set this to true, it's your responsibility to lowercase
 *                   the input entries when you create the {@link SynonymMap}
 */
public SynonymFilter(final TokenStream input, final SynonymLoader synonymLoader, final boolean ignoreCase) {
  super(input);
  if (synonymLoader != null) {
      if (synonymLoader.isReloadable()) {
          this.synonymLoader = synonymLoader;
          this.lastModified = synonymLoader.getLastModified();
      } else {
          this.synonymLoader = null;
          this.lastModified = System.currentTimeMillis();
      }
      this.synonyms = synonymLoader.getSynonymMap();
  }
  if (synonyms == null) {
      throw new IllegalArgumentException("synonyms must be non-null");
  }
  this.ignoreCase = ignoreCase;
  this.fst = synonyms.fst;
  if (fst == null) {
    throw new IllegalArgumentException("fst must be non-null");
  }
  this.fstReader = fst.getBytesReader();

  // Must be 1+ so that when roll buffer is at full
  // lookahead we can distinguish this full buffer from
  // the empty buffer:
  rollBufferSize = 1+synonyms.maxHorizontalContext;

  futureInputs = new PendingInput[rollBufferSize];
  futureOutputs = new PendingOutputs[rollBufferSize];
  for(int pos=0;pos<rollBufferSize;pos++) {
    futureInputs[pos] = new PendingInput();
    futureOutputs[pos] = new PendingOutputs();
  }

  //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

  scratchArc = new FST.Arc<>();
}
 
Example #28
Source File: OrdsSegmentTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the scratch arc at {@code ord}, growing the {@code arcs} array and
 * filling the new slots with fresh arcs when {@code ord} is past its end.
 *
 * @param ord index of the arc to fetch
 * @return the arc stored at {@code ord}
 */
private FST.Arc<Output> getArc(int ord) {
  if (ord < arcs.length) {
    return arcs[ord];
  }

  final int oldLength = arcs.length;
  @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<Output>[] grown =
  new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
  System.arraycopy(arcs, 0, grown, 0, oldLength);
  // Existing arcs are carried over; only the newly added tail needs instances.
  for (int slot = oldLength; slot < grown.length; slot++) {
    grown[slot] = new FST.Arc<>();
  }
  arcs = grown;
  return arcs[ord];
}
 
Example #29
Source File: FSTTermsWriter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Finishes the field: when at least one term was written, compiles the FST
 * dictionary and records it with the field's statistics. Fields without terms
 * are not recorded.
 *
 * @param sumTotalTermFreq statistic stored in the field metadata
 * @param sumDocFreq statistic stored in the field metadata
 * @param docCount statistic stored in the field metadata
 * @throws IOException if compiling the FST fails
 */
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
  if (numTerms <= 0) {
    return; // nothing was written for this field
  }
  // save FST dict
  final FST<FSTTermOutputs.TermData> dictionary = fstCompiler.compile();
  fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, dictionary));
}
 
Example #30
Source File: BlockTreeTermsReader.java    From incubator-retired-blur with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes the frame header stored in {@code frameData} and pushes the frame
 * that follows the current one.
 *
 * <p>The first VLong in {@code frameData} carries the block file pointer in
 * its upper bits and the has-terms / is-floor flags in its low bits.
 *
 * @param arc the FST arc passed through to the file-pointer overload
 * @param frameData encoded frame header bytes
 * @param length prefix length passed through to the file-pointer overload
 * @return the frame at ordinal {@code 1 + currentFrame.ord}
 * @throws IOException if decoding or pushing the frame fails
 */
Frame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length) throws IOException {
  scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
  final long header = scratchReader.readVLong();
  final long blockFP = header >>> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;

  final Frame frame = getFrame(1+currentFrame.ord);

  final boolean withTerms = (header & BlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
  frame.hasTerms = withTerms;
  frame.hasTermsOrig = withTerms;

  final boolean floor = (header & BlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
  frame.isFloor = floor;
  if (floor) {
    // Floor data is installed before the push below; keep this order.
    frame.setFloorData(scratchReader, frameData);
  }

  pushFrame(arc, blockFP, length);
  return frame;
}