Java Code Examples for org.apache.lucene.util.fst.FST

The following examples show how to use org.apache.lucene.util.fst.FST. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: ambiverse-nlu   Source File: TrieBuilder.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds an FST over the given strings, assigning each one a sequential output value
 * that starts at 0.
 *
 * <p>An {@link AssertionError} raised while adding a string is logged at debug level and
 * the string is skipped. The output counter is still consumed once the builder's
 * {@code add} has been invoked, so a skipped string can leave a gap in the values.
 *
 * @param sortedStrings strings to insert, visited in the set's iteration order
 * @return the FST returned by the builder's {@code finish()}
 * @throws IOException propagated from the builder
 */
public static FST<Long> buildTrie(Set<String> sortedStrings) throws IOException {
  final Builder<Long> trieBuilder =
      new Builder<Long>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  final IntsRefBuilder intsScratch = new IntsRefBuilder();

  long nextOutput = 0;
  for (String mention : sortedStrings) {
    utf8Scratch.copyChars(mention);
    try {
      // Keep the increment inside the argument list: it must only happen after the
      // key conversion has succeeded, exactly as the call's evaluation order dictates.
      trieBuilder.add(Util.toIntsRef(utf8Scratch.get(), intsScratch), nextOutput++);
    } catch (java.lang.AssertionError ae) {
      logger.debug("Assertion error for mention " + mention);
    }
  }
  return trieBuilder.finish();
}
 
Example 2
Source Project: lucene-solr   Source File: NormalizeCharMap.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Wraps the given FST and, when it is non-null and its first arc has outgoing arcs,
 * stores a copy of each of those arcs in {@code cachedRootArcs}, keyed by the arc
 * label cast to {@code char}.
 *
 * @param map the FST to wrap; {@code null} is accepted and results in no cached arcs
 */
private NormalizeCharMap(FST<CharsRef> map) {
  this.map = map;
  if (map == null) {
    return;
  }
  try {
    final FST.Arc<CharsRef> cursor = new FST.Arc<>();
    final FST.BytesReader reader = map.getBytesReader();
    map.getFirstArc(cursor);
    if (!FST.targetHasArcs(cursor)) {
      return;
    }
    // Walk the sibling arcs under the first arc's target, copying each into the cache.
    map.readFirstRealTargetArc(cursor.target(), cursor, reader);
    for (;;) {
      assert cursor.label() != FST.END_LABEL;
      final Character key = Character.valueOf((char) cursor.label());
      cachedRootArcs.put(key, new FST.Arc<CharsRef>().copyFrom(cursor));
      if (cursor.isLast()) {
        break;
      }
      map.readNextRealArc(cursor, reader);
    }
  } catch (IOException ioe) {
    // The original author notes these FST IOExceptions are not expected to happen.
    throw new RuntimeException(ioe);
  }
}
 
Example 3
/**
 * Update hook: re-initializes this filter's state from a new synonym map.
 * Every field assigned here was formerly {@code final} and was made mutable for this
 * purpose; assign those fields only from this method, otherwise bugs may result.
 *
 * @param synonymMap the synonym map to switch to
 * @throws IllegalArgumentException if the map's {@code fst} is {@code null}; note that
 *         {@code synonyms} and {@code fst} have already been overwritten at that point
 */
@Override
public void update(SynonymMap synonymMap) {
    this.synonyms = synonymMap;
    this.fst = synonyms.fst;
    if (this.fst == null) {
        throw new IllegalArgumentException("fst must be non-null");
    }

    this.fstReader = this.fst.getBytesReader();

    // One slot more than the map's maximum horizontal context.
    this.rollBufferSize = 1 + synonyms.maxHorizontalContext;
    this.futureInputs = new DynamicSynonymFilter.PendingInput[this.rollBufferSize];
    this.futureOutputs = new DynamicSynonymFilter.PendingOutputs[this.rollBufferSize];
    for (int slot = 0; slot < this.rollBufferSize; slot++) {
        this.futureInputs[slot] = new DynamicSynonymFilter.PendingInput();
        this.futureOutputs[slot] = new DynamicSynonymFilter.PendingOutputs();
    }

    this.scratchArc = new FST.Arc();
}
 
Example 4
Source Project: lucene-solr   Source File: FSTTermsReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Walks the FST breadth-first from its first arc, expanding each target node at most
 * once and visiting every arc that leaves an expanded node.
 *
 * <p>The method produces no result; it only reads arcs through the FST's bytes reader.
 *
 * @param fst the FST to traverse
 * @throws IOException propagated from the FST read calls
 */
static<T> void walk(FST<T> fst) throws IOException {
  // FIFO work list. ArrayDeque removes from the head in O(1); the previous
  // ArrayList.remove(0) shifted the whole backing array on every pop.
  final java.util.Deque<FST.Arc<T>> queue = new java.util.ArrayDeque<>();
  final BitSet seen = new BitSet();
  final FST.BytesReader reader = fst.getBytesReader();
  final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
  queue.add(startArc);
  while (!queue.isEmpty()) {
    final FST.Arc<T> arc = queue.remove();
    final long node = arc.target();
    // NOTE(review): the node address is narrowed to int for the BitSet index; addresses
    // beyond the int range would not be tracked correctly. Confirm FST size limits.
    if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
      seen.set((int) node);
      fst.readFirstRealTargetArc(node, arc, reader);
      while (true) {
        // Enqueue a copy: 'arc' is reused as the cursor for the remaining siblings.
        queue.add(new FST.Arc<T>().copyFrom(arc));
        if (arc.isLast()) {
          break;
        } else {
          fst.readNextRealArc(arc, reader);
        }
      }
    }
  }
}
 
Example 5
Source Project: lucene-solr   Source File: OrdsSegmentTermsEnum.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the vLong header at the start of {@code frameData.bytes}, initializes the frame
 * at ordinal {@code currentFrame.ord + 1} from its flag bits, and delegates to the
 * four-argument {@code pushFrame} with the decoded file pointer.
 *
 * @param arc       arc forwarded to the delegated {@code pushFrame}
 * @param frameData output whose bytes carry the packed header (and floor data, if any)
 * @param length    forwarded to the delegated {@code pushFrame}
 * @return the frame obtained from {@code getFrame}
 * @throws IOException propagated from reading the header or from the delegated call
 */
OrdsSegmentTermsEnumFrame pushFrame(FST.Arc<Output> arc, Output frameData, int length) throws IOException {
  scratchReader.reset(frameData.bytes.bytes, frameData.bytes.offset, frameData.bytes.length);
  final long header = scratchReader.readVLong();

  final OrdsSegmentTermsEnumFrame frame = getFrame(1+currentFrame.ord);
  final boolean termsPresent = (header & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
  frame.hasTerms = termsPresent;
  frame.hasTermsOrig = termsPresent;
  frame.isFloor = (header & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;

  // Must setFloorData before pushFrame in case pushFrame tries to rewind:
  if (frame.isFloor) {
    frame.termOrdOrig = frameData.startOrd;
    frame.setFloorData(scratchReader, frameData.bytes);
  }

  // The bits above the flag bits hold the file pointer to seek to.
  final long filePointer = header >>> OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;
  pushFrame(arc, filePointer, length, frameData.startOrd);

  return frame;
}
 
Example 6
Source Project: lucene-solr   Source File: FuzzySuggester.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Computes the prefix paths by intersecting the FST with the Levenshtein automaton
 * derived from {@code lookupAutomaton}. The {@code prefixPaths} argument is not read here.
 *
 * @param prefixPaths     unused by this implementation
 * @param lookupAutomaton automaton passed to {@code toLevenshteinAutomata}
 * @param fst             FST to intersect with
 * @return the result of {@code FSTUtil.intersectPrefixPaths}
 * @throws IOException propagated from the intersection
 */
@Override
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                     Automaton lookupAutomaton,
                                                                     FST<Pair<Long,BytesRef>> fst)
  throws IOException {

  // TODO: right now there's no penalty for fuzzy/edits,
  // ie a completion whose prefix matched exactly what the
  // user typed gets no boost over completions that
  // required an edit, which get no boost over completions
  // requiring two edits.  I suspect a multiplicative
  // factor is appropriate (eg, say a fuzzy match must be at
  // least 2X better weight than the non-fuzzy match to
  // "compete") ... in which case I think the wFST needs
  // to be log weights or something ...

  return FSTUtil.intersectPrefixPaths(
      convertAutomaton(toLevenshteinAutomata(lookupAutomaton)),
      fst);
}
 
Example 7
Source Project: lucene-solr   Source File: NormalizeCharMap.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Compiles all pending pairs into an FST, clears the pending pairs, and wraps the
 * result in a new {@link NormalizeCharMap}. Call this once you are done calling
 * {@link #add}.
 *
 * @return a new map backed by the compiled FST
 */
public NormalizeCharMap build() {

  final FST<CharsRef> compiled;
  try {
    final Outputs<CharsRef> charOutputs = CharSequenceOutputs.getSingleton();
    final FSTCompiler<CharsRef> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, charOutputs);
    final IntsRefBuilder keyScratch = new IntsRefBuilder();
    for (Map.Entry<String,String> pair : pendingPairs.entrySet()) {
      final String match = pair.getKey();
      final String replacement = pair.getValue();
      compiler.add(Util.toUTF16(match, keyScratch), new CharsRef(replacement));
    }
    compiled = compiler.compile();
    pendingPairs.clear();
  } catch (IOException ioe) {
    // The original author notes these FST IOExceptions are not expected to happen.
    throw new RuntimeException(ioe);
  }

  return new NormalizeCharMap(compiled);
}
 
Example 8
/**
 * Consume a maximal glue morpheme, if any, and consume the next word.
 *
 * <p>Starting at {@code offset}, follows the {@code glueMorphemes} FST code point by code
 * point. Each time a final arc is reached, a {@code GLUE_MORPHEME} chunk covering
 * {@code [offset, i + 1)} is pushed, {@code matchWord} is invoked for the remainder (if
 * any input is left), and the chunk is popped again.
 *
 * @param utf32           input code points; valid indices are
 *                        {@code [utf32.offset, utf32.offset + utf32.length)}
 * @param offset          absolute index into {@code utf32.ints} at which matching starts
 * @param builder         forwarded to {@code matchWord}
 * @param maxPathsBuilder forwarded to {@code matchWord}
 * @param chunks          chunk stack; left as it was on entry once this method returns normally
 * @throws IOException propagated from the FST lookups or from {@code matchWord}
 */
private void matchGlueMorpheme(IntsRef utf32, final int offset, StringBuilder builder,
                               IntsRefBuilder maxPathsBuilder,
                               Deque<Chunk> chunks) throws IOException {
    FST.Arc<Object> arc = glueMorphemes.getFirstArc(new FST.Arc<>());
    BytesReader br = glueMorphemes.getBytesReader();
    // 'i' is an absolute index into utf32.ints, so the exclusive end of the input is
    // offset + length. The previous bound (utf32.length alone) stopped early whenever
    // utf32.offset was non-zero and disagreed with the remaining-input check below.
    final int end = utf32.offset + utf32.length;
    for (int i = offset; i < end; i++) {
        int chr = utf32.ints[i];
        arc = glueMorphemes.findTargetArc(chr, arc, arc, br);
        if (arc == null) {
            // No glue morpheme continues with this code point.
            break;
        }
        if (arc.isFinal()) {
            chunks.addLast(new Chunk(offset, i + 1, ChunkType.GLUE_MORPHEME));
            if (i + 1 < end) {
                matchWord(utf32, i + 1, builder, maxPathsBuilder, chunks);
            }
            chunks.removeLast();
        }
    }
}
 
Example 9
Source Project: lucene-solr   Source File: FSTCompletion.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Collects a copy of every arc leaving the automaton's first arc and returns them in
 * reverse read order (per the original author's note, so that the highest weights come
 * first).
 *
 * @param automaton the FST whose root arcs are cached
 * @return the copied root arcs, last-read arc first
 * @throws RuntimeException wrapping any {@link IOException} raised while reading arcs
 */
@SuppressWarnings({"unchecked","rawtypes"})
private static Arc<Object>[] cacheRootArcs(FST<Object> automaton) {
  final List<Arc<Object>> collected = new ArrayList<>();
  try {
    final Arc<Object> cursor = automaton.getFirstArc(new Arc<>());
    final FST.BytesReader reader = automaton.getBytesReader();
    automaton.readFirstTargetArc(cursor, cursor, reader);
    for (boolean more = true; more; ) {
      // Store a copy because the cursor is overwritten by the next read.
      collected.add(new Arc<>().copyFrom(cursor));
      more = !cursor.isLast();
      if (more) {
        automaton.readNextArc(cursor, reader);
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  Collections.reverse(collected); // we want highest weights first.
  return collected.toArray(new Arc[collected.size()]);
}
 
Example 10
Source Project: lucene-solr   Source File: StemmerOverrideFilter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Compiles the collected entries into an FST and wraps it in a {@link StemmerOverrideMap}
 * for use with the {@link StemmerOverrideFilter}.
 *
 * <p>Entries are added in the order given by {@code hash.sort()}; each key's UTF-8 bytes
 * are copied into an ints scratch and its output is the value stored under the same id
 * in {@code outputValues}.
 *
 * @return a {@link StemmerOverrideMap} built from the compiled FST and {@code ignoreCase}
 * @throws IOException if an {@link IOException} occurs
 */
public StemmerOverrideMap build() throws IOException {
  final FSTCompiler<BytesRef> compiler =
      new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, ByteSequenceOutputs.getSingleton());
  final int[] sortedIds = hash.sort();
  final IntsRefBuilder keyInts = new IntsRefBuilder();
  final int entryCount = hash.size();
  final BytesRef keyScratch = new BytesRef();

  // Bound by the entry count rather than the array length, as the original loop does.
  int rank = 0;
  while (rank < entryCount) {
    final int id = sortedIds[rank];
    keyInts.copyUTF8Bytes(hash.get(id, keyScratch));
    compiler.add(keyInts.get(), new BytesRef(outputValues.get(id)));
    rank++;
  }
  return new StemmerOverrideMap(compiler.compile(), ignoreCase);
}
 
Example 11
Source Project: lucene-solr   Source File: IDVersionSegmentTermsEnum.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the vLong header at the start of {@code frameData.output1}, initializes the
 * frame at ordinal {@code currentFrame.ord + 1} from it, and delegates to the
 * three-argument {@code pushFrame} with the decoded file pointer.
 *
 * @param arc       arc forwarded to the delegated {@code pushFrame}
 * @param frameData pair of the encoded header bytes ({@code output1}) and a value from
 *                  which {@code maxIDVersion} is derived ({@code output2})
 * @param length    forwarded to the delegated {@code pushFrame}
 * @return the frame obtained from {@code getFrame}
 * @throws IOException propagated from reading the header or from the delegated call
 */
IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, Pair<BytesRef,Long> frameData, int length) throws IOException {
  final BytesRef encoded = frameData.output1;
  scratchReader.reset(encoded.bytes, encoded.offset, encoded.length);
  final long header = scratchReader.readVLong();

  final IDVersionSegmentTermsEnumFrame frame = getFrame(1+currentFrame.ord);
  // output2 is stored inverted relative to Long.MAX_VALUE.
  frame.maxIDVersion = Long.MAX_VALUE - frameData.output2;
  final boolean termsPresent = (header & VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
  frame.hasTerms = termsPresent;
  frame.hasTermsOrig = termsPresent;
  frame.isFloor = (header & VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
  if (frame.isFloor) {
    frame.setFloorData(scratchReader, encoded);
  }

  // The bits above the flag bits hold the file pointer to seek to.
  final long filePointer = header >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;
  pushFrame(arc, filePointer, length);

  return frame;
}
 
Example 12
Source Project: lucene-solr   Source File: UserDictionary.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Looks up dictionary words in the given text. For every start position in
 * {@code [off, off + len)} the FST is followed character by character; each time a
 * final arc is reached, the accumulated output plus the arc's final output is recorded.
 *
 * @param chars text
 * @param off offset into text
 * @param len length of text
 * @return list of the recorded outputs (word ids), in the order they were found
 * @throws IOException propagated from the FST lookups
 */
public List<Integer> lookup(char[] chars, int off, int len) throws IOException {
  final List<Integer> wordIds = new ArrayList<>();
  final FST.BytesReader reader = fst.getBytesReader();
  final int limit = off + len;

  FST.Arc<Long> arc = new FST.Arc<>();
  for (int start = off; start < limit; start++) {
    // Restart from the FST's first arc for every start position.
    arc = fst.getFirstArc(arc);
    int accumulated = 0;
    for (int pos = start; pos < limit; pos++) {
      final int ch = chars[pos];
      final boolean firstStep = (pos == start);
      if (fst.findTargetArc(ch, arc, arc, firstStep, reader) == null) {
        break; // no match from here; move on to the next start position
      }
      accumulated += arc.output().intValue();
      if (arc.isFinal()) {
        wordIds.add(accumulated + arc.nextFinalOutput().intValue());
      }
    }
  }
  return wordIds;
}
 
Example 13
Source Project: datawave   Source File: DatawaveArithmetic.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Tests whether the string form of {@code object} is present in the FST.
 *
 * <p>The string is converted with {@code Util.toUTF16} and looked up with
 * {@code Util.get} while holding the FST's monitor.
 *
 * @param object value whose {@code toString()} is looked up
 * @param fst    the FST to query
 * @return {@code true} if the lookup yields a non-null output
 * @throws IOException propagated from the lookup
 */
public static boolean matchesFst(Object object, FST fst) throws IOException {
    final IntsRefBuilder utf16 = new IntsRefBuilder();
    Util.toUTF16(object.toString(), utf16);
    final IntsRef key = utf16.get();

    final boolean found;
    synchronized (fst) {
        found = Util.get(fst, key) != null;
    }
    return found;
}
 
Example 14
Source Project: datawave   Source File: PushdownLargeFieldedListsVisitor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds an FST from the given values and writes it to a new file under
 * {@code fstHdfsUri}, optionally compressed with the codec named in the configuration.
 *
 * <p>The file name is {@code PushdownLargeFileFst.<n>.fst<ext>}, where {@code n} comes from
 * the configuration's FST counter and {@code ext} is the codec's default extension
 * (empty when no codec is configured). The file is created without overwrite.
 *
 * @param values the values to build the FST from
 * @return the URI of the written file
 * @throws IOException            if creating or writing the file fails
 * @throws ClassNotFoundException if the configured codec class cannot be loaded
 * @throws InstantiationException if the codec cannot be instantiated
 * @throws IllegalAccessException if the codec's constructor is not accessible
 */
protected URI createFst(SortedSet<String> values) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
    FST fst = DatawaveFieldIndexListIteratorJexl.getFST(values);
    
    // now serialize to our file system
    CompressionCodec codec = null;
    String extension = "";
    if (config.getHdfsFileCompressionCodec() != null) {
        // Prefer the thread's context class loader; fall back to this class's loader.
        ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
        if (classLoader == null) {
            classLoader = this.getClass().getClassLoader();
        }
        Class<? extends CompressionCodec> clazz = Class.forName(config.getHdfsFileCompressionCodec(), true, classLoader).asSubclass(CompressionCodec.class);
        codec = clazz.newInstance();
        extension = codec.getDefaultExtension();
    }
    int fstCount = config.getFstCount().incrementAndGet();
    Path fstFile = new Path(fstHdfsUri, "PushdownLargeFileFst." + fstCount + ".fst" + extension);
    
    OutputStream fstFileOut = new BufferedOutputStream(fs.create(fstFile, false));
    if (codec != null) {
        fstFileOut = codec.createOutputStream(fstFileOut);
    }
    
    // try-with-resources: the stream is closed even when fst.save throws, which the
    // previous explicit close() after save did not guarantee.
    try (OutputStreamDataOutput outStream = new OutputStreamDataOutput(fstFileOut)) {
        fst.save(outStream);
    }
    
    return fstFile.toUri();
}
 
Example 15
Source Project: lucene-solr   Source File: BlockTreeTermsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a pending block; calls the superclass constructor with {@code false} and
 * stores every argument in the field of the same name.
 *
 * @param prefix        stored as {@code this.prefix}
 * @param fp            stored as {@code this.fp}
 * @param hasTerms      stored as {@code this.hasTerms}
 * @param isFloor       stored as {@code this.isFloor}
 * @param floorLeadByte stored as {@code this.floorLeadByte}
 * @param subIndices    stored as {@code this.subIndices} (the list reference is kept, not copied)
 */
public PendingBlock(
    BytesRef prefix,
    long fp,
    boolean hasTerms,
    boolean isFloor,
    int floorLeadByte,
    List<FST<BytesRef>> subIndices) {
  super(false);

  // Identity of the block.
  this.prefix = prefix;
  this.fp = fp;

  // Flags.
  this.hasTerms = hasTerms;
  this.isFloor = isFloor;
  this.floorLeadByte = floorLeadByte;

  // Sub-indices attached to the block.
  this.subIndices = subIndices;
}
 
Example 16
/**
 * Returns the FST for the given file, loading it via {@code loadFSTFromFile} and storing
 * it in {@code fstCache} when no non-null entry is cached yet.
 *
 * @param fstfile         cache key and location of the FST file; must not be {@code null}
 * @param compressedCodec forwarded to {@code loadFSTFromFile}
 * @param fs              file system forwarded to {@code loadFSTFromFile}
 * @return the cached or freshly loaded FST
 * @throws NullPointerException if {@code fstfile} is {@code null}
 * @throws IOException          propagated from loading the FST
 */
public static synchronized FST<Object> get(Path fstfile, String compressedCodec, FileSystem fs) throws IOException {
    if (fstfile == null) {
        throw new NullPointerException("input fst key was null");
    }

    FST<Object> cached = fstCache.get(fstfile);
    if (cached == null) {
        // Cache miss: attempt to load the FST from HDFS and remember it.
        cached = loadFSTFromFile(fstfile, compressedCodec, fs);
        fstCache.put(fstfile, cached);
    }
    return cached;
}
 
Example 17
Source Project: ambiverse-nlu   Source File: FSTCursor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a cursor positioned on the FST's first arc, with the output initialized to
 * the FST's "no output" value and a bytes reader obtained from the FST.
 *
 * @param fst            the FST to traverse
 * @param characterStart stored as {@code this.characterStart}
 */
public FSTCursor(FST<Long> fst, int characterStart) {
  this.fst = fst;
  this.characterStart = characterStart;

  // Traversal state, all derived from the FST itself.
  this.arc = fst.getFirstArc(new FST.Arc<Long>());
  this.output = fst.outputs.getNoOutput();
  this.fstReader = fst.getBytesReader();
}
 
Example 18
Source Project: ambiverse-nlu   Source File: TextSpotterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * With a single lower-cased mention in the trie and a threshold of 1.0, exactly one
 * spot is expected in {@code sentenceText}.
 */
@Test
public void shouldSpotOneWord() throws IOException {
	final String mention = "Германия".toLowerCase();
	final Set<String> mentions = new TreeSet<>(Collections.singletonList(mention));
	final FST<Long> trie = TrieBuilder.buildTrie(mentions);

	final Set<Spot> found = TextSpotter.spotTrieEntriesInTextIgnoreCase(trie, sentenceText, begins, ends, 1.0);

	assertEquals(1, found.size());
}
 
Example 19
Source Project: ambiverse-nlu   Source File: TextSpotterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * A mention that differs from the text in one character (87.5% match per the original
 * comment) is expected to be spotted at a threshold of 0.8, and the spotted span must
 * read "Германия".
 */
@Test
public void shouldSpotOneFuzzyWord() throws IOException {
	final String mention = "Германие".toLowerCase(); //87.5% match
	final Set<String> mentions = new TreeSet<>(Collections.singletonList(mention));
	final FST<Long> trie = TrieBuilder.buildTrie(mentions);

	final Set<Spot> found = TextSpotter.spotTrieEntriesInTextIgnoreCase(trie, sentenceText, begins, ends, 0.8);

	assertEquals(1, found.size());
	final Spot only = found.iterator().next();
	final String spottedText = sentenceText.substring(only.getBegin(), only.getEnd());
	assertEquals("Германия", spottedText);
}
 
Example 20
Source Project: ambiverse-nlu   Source File: TextSpotterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * A mention that differs from "Einstein" in its last character (87.5% match per the
 * original comment) is expected to yield exactly one spot reading "Einstein" at a
 * threshold of 0.8.
 *
 * NOTE(review): the method name says "NotSpot" but the assertions expect a spot —
 * confirm which one reflects the intent.
 */
@Test
public void shouldNotSpotOneFuzzyWord() throws IOException {
	final String mention = "Einstei1".toLowerCase(); //87.5% match
	final Set<String> mentions = new TreeSet<>(Collections.singletonList(mention));
	final FST<Long> trie = TrieBuilder.buildTrie(mentions);

	final Set<Spot> found = TextSpotter.spotTrieEntriesInTextIgnoreCase(trie, sentenceText, begins, ends, 0.8);

	assertEquals(1, found.size());
	final Spot only = found.iterator().next();
	final String spottedText = sentenceText.substring(only.getBegin(), only.getEnd());
	assertEquals("Einstein", spottedText);
}
 
Example 21
Source Project: lucene-solr   Source File: BlockTreeTermsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Re-adds every input/output pair of {@code subIndex} to {@code fstCompiler}, in the
 * order produced by a {@code BytesRefFSTEnum} over the sub-index.
 *
 * @param fstCompiler    compiler receiving the entries
 * @param subIndex       FST whose entries are enumerated
 * @param scratchIntsRef scratch used to convert each input for the compiler
 * @throws IOException propagated from enumerating or adding
 */
private void append(FSTCompiler<BytesRef> fstCompiler, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<BytesRef> entries = new BytesRefFSTEnum<>(subIndex);
  for (BytesRefFSTEnum.InputOutput<BytesRef> entry = entries.next();
       entry != null;
       entry = entries.next()) {
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), entry.output);
  }
}
 
Example 22
Source Project: ambiverse-nlu   Source File: TextSpotterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * A two-word mention with one wrong character per word is expected to yield exactly one
 * spot reading "Albert Einstein" at a threshold of 0.90.
 *
 * NOTE(review): unlike its sibling tests this method carries no {@code @Test}
 * annotation in the visible source, and the original comment states a 76.47% match,
 * which is below the 0.90 threshold — confirm whether the test is intentionally disabled.
 */
public void shouldSpotTwoFuzzyWords() throws IOException {
	final String mention = "Alber1 Einstei2".toLowerCase(); //76.47% match
	final Set<String> mentions = new TreeSet<>(Collections.singletonList(mention));
	final FST<Long> trie = TrieBuilder.buildTrie(mentions);

	final Set<Spot> found = TextSpotter.spotTrieEntriesInTextIgnoreCase(trie, sentenceText, begins, ends, 0.90);

	assertEquals(1, found.size());
	final Spot only = found.iterator().next();
	final String spottedText = sentenceText.substring(only.getBegin(), only.getEnd());
	assertEquals("Albert Einstein", spottedText);
}
 
Example 23
Source Project: ambiverse-nlu   Source File: TextSpotterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * With only "gates" in the trie, no spot is expected in {@code sentenceText2} at a
 * threshold of 0.8.
 */
@Test
public void shouldNotSpotGatessssWord() throws IOException {
	final String mention = "Gates".toLowerCase();
	final Set<String> mentions = new TreeSet<>(Collections.singletonList(mention));
	final FST<Long> trie = TrieBuilder.buildTrie(mentions);

	final Set<Spot> found = TextSpotter.spotTrieEntriesInTextIgnoreCase(trie, sentenceText2, begins, ends, 0.8);

	assertEquals(0, found.size());
}
 
Example 24
Source Project: ambiverse-nlu   Source File: TextSpotterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * With only "merkel" in the trie, no spot is expected in {@code sentenceText} at a
 * threshold of 0.9.
 */
@Test
public void shouldNotSpotShorterMerkeWord() throws IOException {
	final String mention = "Merkel".toLowerCase();
	final Set<String> mentions = new TreeSet<>(Collections.singletonList(mention));
	final FST<Long> trie = TrieBuilder.buildTrie(mentions);

	final Set<Spot> found = TextSpotter.spotTrieEntriesInTextIgnoreCase(trie, sentenceText, begins, ends, 0.9);

	assertEquals(0, found.size());
}
 
Example 25
Source Project: lucene-solr   Source File: SegmentTermsEnum.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Decodes the vLong header at the start of {@code frameData}, initializes the frame at
 * ordinal {@code currentFrame.ord + 1} from its flag bits, and delegates to the
 * three-argument {@code pushFrame} with the decoded file pointer.
 *
 * @param arc       arc forwarded to the delegated {@code pushFrame}
 * @param frameData bytes carrying the packed header (and floor data, if any)
 * @param length    forwarded to the delegated {@code pushFrame}
 * @return the frame obtained from {@code getFrame}
 * @throws IOException propagated from reading the header or from the delegated call
 */
SegmentTermsEnumFrame pushFrame(FST.Arc<BytesRef> arc, BytesRef frameData, int length) throws IOException {
  scratchReader.reset(frameData.bytes, frameData.offset, frameData.length);
  final long header = scratchReader.readVLong();

  final SegmentTermsEnumFrame frame = getFrame(1+currentFrame.ord);
  final boolean termsPresent = (header & BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS) != 0;
  frame.hasTerms = termsPresent;
  frame.hasTermsOrig = termsPresent;
  frame.isFloor = (header & BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0;
  if (frame.isFloor) {
    // The reader is positioned just past the header at this point.
    frame.setFloorData(scratchReader, frameData);
  }

  // The bits above the flag bits hold the file pointer to seek to.
  final long filePointer = header >>> BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS;
  pushFrame(arc, filePointer, length);

  return frame;
}
 
Example 26
Source Project: lucene-solr   Source File: BooleanPerceptronClassifier.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Rebuilds {@code fst} from the given weights: each key becomes an input and the
 * {@code long} value of its weight (fractional part dropped by {@code longValue()})
 * becomes the output.
 *
 * @param weights term weights, iterated in the map's sorted key order
 * @throws IOException propagated from the FST compiler
 */
private void updateFST(SortedMap<String, Double> weights) throws IOException {
  final FSTCompiler<Long> compiler =
      new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
  final BytesRefBuilder keyBytes = new BytesRefBuilder();
  final IntsRefBuilder keyInts = new IntsRefBuilder();

  for (Map.Entry<String, Double> weighted : weights.entrySet()) {
    keyBytes.copyChars(weighted.getKey());
    compiler.add(
        Util.toIntsRef(keyBytes.get(), keyInts),
        weighted.getValue().longValue());
  }

  fst = compiler.compile();
}
 
Example 27
Source Project: Elasticsearch   Source File: XAnalyzingSuggester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new suggester.
 *
 * @param indexAnalyzer Analyzer that will be used for
 *   analyzing suggestions while building the index.
 * @param queryPrefix Automaton stored for context dependent
 *   suggestions; kept as passed, no validation is done here.
 * @param queryAnalyzer Analyzer that will be used for
 *   analyzing query text during lookup
 * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
 * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
 *   surface forms to keep for a single analyzed form.
 *   When there are too many surface forms we discard the
 *   lowest weighted ones. Must be in the range 1..256.
 * @param maxGraphExpansions Maximum number of graph paths
 *   to expand from the analyzed form.  Set this to -1 for
 *   no limit.
 * @param preservePositionIncrements stored as passed
 * @param fst FST to use; stored as passed (no null check is done here)
 * @param hasPayloads stored as passed
 * @param maxAnalyzedPathsForOneInput stored as passed
 * @param sepLabel stored as passed
 * @param payloadSep stored as passed
 * @param endByte stored as passed
 * @param holeCharacter stored as passed
 * @throws IllegalArgumentException if {@code options} contains bits other than
 *   {@link #EXACT_FIRST} and {@link #PRESERVE_SEP}, if
 *   {@code maxSurfaceFormsPerAnalyzedForm} is outside 1..256, or if
 *   {@code maxGraphExpansions} is neither -1 nor at least 1
 */
public XAnalyzingSuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                           boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
                           int sepLabel, int payloadSep, int endByte, int holeCharacter) {
    // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput
  this.indexAnalyzer = indexAnalyzer;
  this.queryAnalyzer = queryAnalyzer;
  this.fst = fst;
  this.hasPayloads = hasPayloads;
  if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
    throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
  }
  this.exactFirst = (options & EXACT_FIRST) != 0;
  this.preserveSep = (options & PRESERVE_SEP) != 0;

  // FLORIAN EDIT: I added <code>queryPrefix</code> for context dependent suggestions
  this.queryPrefix = queryPrefix;

  // NOTE: this is just an implementation limitation; if
  // somehow this is a problem we could fix it by using
  // more than one byte to disambiguate ... but 256 seems
  // like it should be way more than enough.
  if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
    // The check accepts 256, so the message states "<= 256" (it previously said "< 256").
    throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and <= 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")");
  }
  this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;

  if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
    throw new IllegalArgumentException("maxGraphExpansions must be -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
  }
  this.maxGraphExpansions = maxGraphExpansions;
  this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
  this.preservePositionIncrements = preservePositionIncrements;
  this.sepLabel = sepLabel;
  this.payloadSep = payloadSep;
  this.endByte = endByte;
  this.holeCharacter = holeCharacter;
}
 
Example 28
Source Project: Elasticsearch   Source File: XAnalyzingSuggester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Loads the suggester state from the stream: first the FST, then
 * {@code maxAnalyzedPathsForOneInput} as a vInt, then a single byte whose value 1 means
 * payloads are present. The stream is closed via {@code IOUtils.close} whether or not
 * reading succeeds.
 *
 * @param input stream to read from; closed by this method
 * @return always {@code true} when reading completes without an exception
 * @throws IOException propagated from reading or closing
 */
@Override
public boolean load(InputStream input) throws IOException {
  final DataInput source = new InputStreamDataInput(input);
  try {
    // Read order matters: it must mirror the order in which the data was written.
    this.fst = new FST<>(source, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
    this.maxAnalyzedPathsForOneInput = source.readVInt();
    final byte payloadFlag = source.readByte();
    this.hasPayloads = (payloadFlag == 1);
  } finally {
    IOUtils.close(input);
  }
  return true;
}
 
Example 29
Source Project: Elasticsearch   Source File: XAnalyzingSuggester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Loads the suggester state from the given input: {@code count} as a vLong, the FST,
 * {@code maxAnalyzedPathsForOneInput} as a vInt, and a single byte whose value 1 means
 * payloads are present. The input is not closed here.
 *
 * @param input data input to read from
 * @return always {@code true} when reading completes without an exception
 * @throws IOException propagated from reading
 */
@Override
public boolean load(DataInput input) throws IOException {
  // Read order matters: it must mirror the order in which the data was written.
  count = input.readVLong();
  this.fst = new FST<>(
      input,
      new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
  this.maxAnalyzedPathsForOneInput = input.readVInt();

  final byte payloadFlag = input.readByte();
  this.hasPayloads = (payloadFlag == 1);
  return true;
}
 
Example 30
Source Project: Elasticsearch   Source File: XAnalyzingSuggester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns all completion paths to initialize the search.
 *
 * <p>This implementation hands back {@code prefixPaths} unchanged; the automaton and
 * the FST are not consulted.
 *
 * @param prefixPaths     the prefix paths found so far
 * @param lookupAutomaton unused by this implementation
 * @param fst             unused by this implementation
 * @return {@code prefixPaths}, the same list instance
 * @throws IOException declared in the signature; not thrown by this implementation
 */
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(
    List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
    Automaton lookupAutomaton,
    FST<Pair<Long,BytesRef>> fst) throws IOException {
  return prefixPaths;
}