org.apache.lucene.util.fst.PositiveIntOutputs Java Examples

The following examples show how to use org.apache.lucene.util.fst.PositiveIntOutputs. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TrieBuilder.java    From ambiverse-nlu with Apache License 2.0 6 votes vote down vote up
public static FST<Long> buildTrie(Set<String> sortedStrings) throws IOException {
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
  BytesRefBuilder scratchBytes = new BytesRefBuilder();
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  long outputValue = 0;
  for (String mention : sortedStrings) {
    scratchBytes.copyChars(mention);
    try {
      builder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), outputValue++);
    } catch (java.lang.AssertionError ae) {
      logger.debug("Assertion error for mention " + mention);
    }
  }
  return builder.finish();
}
 
Example #2
Source File: FreeTextSuggester.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
@Override
public boolean load(DataInput input) throws IOException {
  CodecUtil.checkHeader(input, CODEC_NAME, VERSION_START, VERSION_START);
  count = input.readVLong();
  byte separatorOrig = input.readByte();
  if (separatorOrig != separator) {
    throw new IllegalStateException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig);
  }
  int gramsOrig = input.readVInt();
  if (gramsOrig != grams) {
    throw new IllegalStateException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig);
  }
  totTokens = input.readVLong();

  fst = new FST<>(input, input, PositiveIntOutputs.getSingleton());

  return true;
}
 
Example #3
Source File: NRTSuggester.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Loads a {@link NRTSuggester} from {@link org.apache.lucene.store.IndexInput} on or off-heap
 * depending on the provided <code>fstLoadMode</code>
 */
public static NRTSuggester load(IndexInput input, FSTLoadMode fstLoadMode) throws IOException {
  final FST<Pair<Long, BytesRef>> fst;
  if (shouldLoadFSTOffHeap(input, fstLoadMode)) {
    OffHeapFSTStore store = new OffHeapFSTStore();
    IndexInput clone = input.clone();
    clone.seek(input.getFilePointer());
    fst = new FST<>(clone, clone, new PairOutputs<>(
        PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()), store);
    input.seek(clone.getFilePointer() + store.size());
  } else {
    fst = new FST<>(input, input, new PairOutputs<>(
        PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
  }

  /* read some meta info */
  int maxAnalyzedPathsPerOutput = input.readVInt();
  /*
   * Label used to denote the end of an input in the FST and
   * the beginning of dedup bytes
   */
  int endByte = input.readVInt();
  int payloadSep = input.readVInt();
  return new NRTSuggester(fst, maxAnalyzedPathsPerOutput, payloadSep);
}
 
Example #4
Source File: XAnalyzingSuggester.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Override
public boolean load(InputStream input) throws IOException {
  DataInput dataIn = new InputStreamDataInput(input);
  try {
    this.fst = new FST<>(dataIn, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
    maxAnalyzedPathsForOneInput = dataIn.readVInt();
    hasPayloads = dataIn.readByte() == 1;
  } finally {
    IOUtils.close(input);
  }
  return true;
}
 
Example #5
Source File: XAnalyzingSuggester.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Override
public boolean load(DataInput input) throws IOException {
  count = input.readVLong();
  this.fst = new FST<>(input, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
  maxAnalyzedPathsForOneInput = input.readVInt();
  hasPayloads = input.readByte() == 1;
  return true;
}
 
Example #6
Source File: XAnalyzingSuggester.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) {
    this.payloadSep = payloadSep;
    this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
    this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
    this.hasPayloads = hasPayloads;
    surfaceFormsAndPayload = new SurfaceFormAndPayload[maxSurfaceFormsPerAnalyzedForm];

}
 
Example #7
Source File: VariableGapTermsIndexWriter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
  this.fieldInfo = fieldInfo;
  fstOutputs = PositiveIntOutputs.getSingleton();
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
  indexStart = out.getFilePointer();
  ////System.out.println("VGW: field=" + fieldInfo.name);

  // Always put empty string in
  fstCompiler.add(new IntsRef(), termsFilePointer);
  startTermsFilePointer = termsFilePointer;
}
 
Example #8
Source File: AnalyzingSuggester.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public boolean load(DataInput input) throws IOException {
  count = input.readVLong();
  this.fst = new FST<>(input, input, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
  maxAnalyzedPathsForOneInput = input.readVInt();
  hasPayloads = input.readByte() == 1;
  return true;
}
 
Example #9
Source File: NRTSuggesterBuilder.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Create a builder for {@link NRTSuggester}
 */
public NRTSuggesterBuilder() {
  this.payloadSep = PAYLOAD_SEP;
  this.endByte = END_BYTE;
  this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
  this.entries = new PriorityQueue<>();
  this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
}
 
Example #10
Source File: TokenInfoDictionary.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
 * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
 * this class's name as the path.
 */
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
  super(resourceScheme, resourcePath);
  FST<Long> fst;
  try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
    DataInput in = new InputStreamDataInput(is);
    fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
  }
  // TODO: some way to configure?
  this.fst = new TokenInfoFST(fst, true);
}
 
Example #11
Source File: TokenInfoDictionary.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
 * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
 * this class's name as the path.
 */
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
  super(resourceScheme, resourcePath);
  FST<Long> fst;
  try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
    DataInput in = new InputStreamDataInput(is);
    fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
  }
  this.fst = new TokenInfoFST(fst);
}
 
Example #12
Source File: BooleanPerceptronClassifier.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
private void updateFST(SortedMap<String, Double> weights) throws IOException {
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  BytesRefBuilder scratchBytes = new BytesRefBuilder();
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String, Double> entry : weights.entrySet()) {
    scratchBytes.copyChars(entry.getKey());
    fstCompiler.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry
            .getValue().longValue());
  }
  fst = fstCompiler.compile();
}
 
Example #13
Source File: SimpleTextFieldsReader.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
private void loadTerms() throws IOException {
  PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
  final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
  final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
  final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
      outputsInner);
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  IndexInput in = SimpleTextFieldsReader.this.in.clone();
  in.seek(termsStart);
  final BytesRefBuilder lastTerm = new BytesRefBuilder();
  long lastDocsStart = -1;
  int docFreq = 0;
  long totalTermFreq = 0;
  FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  while(true) {
    SimpleTextUtil.readLine(in, scratch);
    if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
            outputs.newPair(lastDocsStart,
                outputsInner.newPair((long) docFreq, totalTermFreq)));
        sumTotalTermFreq += totalTermFreq;
      }
      break;
    } else if (StringHelper.startsWith(scratch.get(), DOC)) {
      docFreq++;
      sumDocFreq++;
      totalTermFreq++;
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), DOC.length, scratch.length()-DOC.length);
      int docID = ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
      visitedDocs.set(docID);
    } else if (StringHelper.startsWith(scratch.get(), FREQ)) {
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), FREQ.length, scratch.length()-FREQ.length);
      totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
    } else if (StringHelper.startsWith(scratch.get(), TERM)) {
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
            outputsInner.newPair((long) docFreq, totalTermFreq)));
      }
      lastDocsStart = in.getFilePointer();
      final int len = scratch.length() - TERM.length;
      lastTerm.grow(len);
      System.arraycopy(scratch.bytes(), TERM.length, lastTerm.bytes(), 0, len);
      lastTerm.setLength(len);
      docFreq = 0;
      sumTotalTermFreq += totalTermFreq;
      totalTermFreq = 0;
      termCount++;
    }
  }
  docCount = visitedDocs.cardinality();
  fst = fstCompiler.compile();
  /*
  PrintStream ps = new PrintStream("out.dot");
  fst.toDot(ps);
  ps.close();
  System.out.println("SAVED out.dot");
  */
  //System.out.println("FST " + fst.sizeInBytes());
}
 
Example #14
Source File: FSTDictionary.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public Builder() {
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  scratchInts = new IntsRefBuilder();
}
 
Example #15
Source File: WFSTCompletionLookup.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
@Override
public boolean load(DataInput input) throws IOException {
  count = input.readVLong();
  this.fst = new FST<>(input, input, PositiveIntOutputs.getSingleton());
  return true;
}
 
Example #16
Source File: UserDictionary.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
private UserDictionary(List<String[]> featureEntries) throws IOException {

    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    Collections.sort(featureEntries, new Comparator<String[]>() {
      @Override
      public int compare(String[] left, String[] right) {
        return left[0].compareTo(right[0]);
     }
    });
    
    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());
    
    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;
    
    for (String[] values : featureEntries) {
      String surface = values[0].replaceAll("\\s", "");
      String concatenatedSegment = values[1].replaceAll("\\s", "");
      String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
      String[] readings = values[2].replaceAll("  *", " ").split(" ");
      String pos = values[3];
      
      if (segmentation.length != readings.length) {
        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
                                   " - the number of segmentations (" + segmentation.length + ")" +
                                   " does not the match number of readings (" + readings.length + ")");
      }

      if (!surface.equals(concatenatedSegment)) {
        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
                                   " - the concatenated segmentation (" + concatenatedSegment + ")" +
                                   " does not match the surface form (" + surface + ")");
      }
      
      int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
      wordIdAndLength[0] = wordId;
      for (int i = 0; i < segmentation.length; i++) {
        wordIdAndLength[i + 1] = segmentation[i].length();
        data.add(readings[i] + INTERNAL_SEPARATOR + pos);
        wordId++;
      }
      // add mapping to FST
      String token = values[0];
      scratch.grow(token.length());
      scratch.setLength(token.length());
      for (int i = 0; i < token.length(); i++) {
        scratch.setIntAt(i, (int) token.charAt(i));
      }
      fstCompiler.add(scratch.get(), ord);
      segmentations.add(wordIdAndLength);
      ord++;
    }
    this.fst = new TokenInfoFST(fstCompiler.compile(), false);
    this.data = data.toArray(new String[data.size()]);
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
  }