Java Code Examples for org.apache.lucene.util.fst.PositiveIntOutputs

The following examples show how to use org.apache.lucene.util.fst.PositiveIntOutputs. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: ambiverse-nlu   Source File: TrieBuilder.java    License: Apache License 2.0 6 votes vote down vote up
public static FST<Long> buildTrie(Set<String> sortedStrings) throws IOException {
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
  BytesRefBuilder scratchBytes = new BytesRefBuilder();
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  long outputValue = 0;
  for (String mention : sortedStrings) {
    scratchBytes.copyChars(mention);
    try {
      builder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), outputValue++);
    } catch (java.lang.AssertionError ae) {
      logger.debug("Assertion error for mention " + mention);
    }
  }
  return builder.finish();
}
 
Example 2
Source Project: lucene-solr   Source File: FreeTextSuggester.java    License: Apache License 2.0 6 votes vote down vote up
@Override
public boolean load(DataInput input) throws IOException {
  CodecUtil.checkHeader(input, CODEC_NAME, VERSION_START, VERSION_START);
  count = input.readVLong();
  byte separatorOrig = input.readByte();
  if (separatorOrig != separator) {
    throw new IllegalStateException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig);
  }
  int gramsOrig = input.readVInt();
  if (gramsOrig != grams) {
    throw new IllegalStateException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig);
  }
  totTokens = input.readVLong();

  fst = new FST<>(input, input, PositiveIntOutputs.getSingleton());

  return true;
}
 
Example 3
Source Project: lucene-solr   Source File: NRTSuggester.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Loads a {@link NRTSuggester} from {@link org.apache.lucene.store.IndexInput} on or off-heap
 * depending on the provided <code>fstLoadMode</code>
 */
public static NRTSuggester load(IndexInput input, FSTLoadMode fstLoadMode) throws IOException {
  final FST<Pair<Long, BytesRef>> fst;
  if (shouldLoadFSTOffHeap(input, fstLoadMode)) {
    OffHeapFSTStore store = new OffHeapFSTStore();
    IndexInput clone = input.clone();
    clone.seek(input.getFilePointer());
    fst = new FST<>(clone, clone, new PairOutputs<>(
        PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()), store);
    input.seek(clone.getFilePointer() + store.size());
  } else {
    fst = new FST<>(input, input, new PairOutputs<>(
        PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
  }

  /* read some meta info */
  int maxAnalyzedPathsPerOutput = input.readVInt();
  /*
   * Label used to denote the end of an input in the FST and
   * the beginning of dedup bytes
   */
  int endByte = input.readVInt();
  int payloadSep = input.readVInt();
  return new NRTSuggester(fst, maxAnalyzedPathsPerOutput, payloadSep);
}
 
Example 4
Source Project: Elasticsearch   Source File: XAnalyzingSuggester.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public boolean load(InputStream input) throws IOException {
  DataInput dataIn = new InputStreamDataInput(input);
  try {
    this.fst = new FST<>(dataIn, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
    maxAnalyzedPathsForOneInput = dataIn.readVInt();
    hasPayloads = dataIn.readByte() == 1;
  } finally {
    IOUtils.close(input);
  }
  return true;
}
 
Example 5
Source Project: Elasticsearch   Source File: XAnalyzingSuggester.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public boolean load(DataInput input) throws IOException {
  count = input.readVLong();
  this.fst = new FST<>(input, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
  maxAnalyzedPathsForOneInput = input.readVInt();
  hasPayloads = input.readByte() == 1;
  return true;
}
 
Example 6
Source Project: Elasticsearch   Source File: XAnalyzingSuggester.java    License: Apache License 2.0 5 votes vote down vote up
public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) {
    this.payloadSep = payloadSep;
    this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
    this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
    this.hasPayloads = hasPayloads;
    surfaceFormsAndPayload = new SurfaceFormAndPayload[maxSurfaceFormsPerAnalyzedForm];

}
 
Example 7
Source Project: lucene-solr   Source File: VariableGapTermsIndexWriter.java    License: Apache License 2.0 5 votes vote down vote up
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
  this.fieldInfo = fieldInfo;
  fstOutputs = PositiveIntOutputs.getSingleton();
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
  indexStart = out.getFilePointer();
  ////System.out.println("VGW: field=" + fieldInfo.name);

  // Always put empty string in
  fstCompiler.add(new IntsRef(), termsFilePointer);
  startTermsFilePointer = termsFilePointer;
}
 
Example 8
Source Project: lucene-solr   Source File: AnalyzingSuggester.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public boolean load(DataInput input) throws IOException {
  count = input.readVLong();
  this.fst = new FST<>(input, input, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
  maxAnalyzedPathsForOneInput = input.readVInt();
  hasPayloads = input.readByte() == 1;
  return true;
}
 
Example 9
Source Project: lucene-solr   Source File: NRTSuggesterBuilder.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Create a builder for {@link NRTSuggester}
 */
public NRTSuggesterBuilder() {
  this.payloadSep = PAYLOAD_SEP;
  this.endByte = END_BYTE;
  this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
  this.entries = new PriorityQueue<>();
  this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
}
 
Example 10
Source Project: lucene-solr   Source File: TokenInfoDictionary.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
 * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
 * this class's name as the path.
 */
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
  super(resourceScheme, resourcePath);
  FST<Long> fst;
  try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
    DataInput in = new InputStreamDataInput(is);
    fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
  }
  // TODO: some way to configure?
  this.fst = new TokenInfoFST(fst, true);
}
 
Example 11
Source Project: lucene-solr   Source File: TokenInfoDictionary.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
 * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
 * this class's name as the path.
 */
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
  super(resourceScheme, resourcePath);
  FST<Long> fst;
  try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
    DataInput in = new InputStreamDataInput(is);
    fst = new FST<>(in, in, PositiveIntOutputs.getSingleton());
  }
  this.fst = new TokenInfoFST(fst);
}
 
Example 12
Source Project: lucene-solr   Source File: BooleanPerceptronClassifier.java    License: Apache License 2.0 5 votes vote down vote up
private void updateFST(SortedMap<String, Double> weights) throws IOException {
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  BytesRefBuilder scratchBytes = new BytesRefBuilder();
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String, Double> entry : weights.entrySet()) {
    scratchBytes.copyChars(entry.getKey());
    fstCompiler.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry
            .getValue().longValue());
  }
  fst = fstCompiler.compile();
}
 
Example 13
Source Project: lucene-solr   Source File: SimpleTextFieldsReader.java    License: Apache License 2.0 4 votes vote down vote up
private void loadTerms() throws IOException {
  PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
  final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
  final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
  final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
      outputsInner);
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  IndexInput in = SimpleTextFieldsReader.this.in.clone();
  in.seek(termsStart);
  final BytesRefBuilder lastTerm = new BytesRefBuilder();
  long lastDocsStart = -1;
  int docFreq = 0;
  long totalTermFreq = 0;
  FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  while(true) {
    SimpleTextUtil.readLine(in, scratch);
    if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
            outputs.newPair(lastDocsStart,
                outputsInner.newPair((long) docFreq, totalTermFreq)));
        sumTotalTermFreq += totalTermFreq;
      }
      break;
    } else if (StringHelper.startsWith(scratch.get(), DOC)) {
      docFreq++;
      sumDocFreq++;
      totalTermFreq++;
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), DOC.length, scratch.length()-DOC.length);
      int docID = ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
      visitedDocs.set(docID);
    } else if (StringHelper.startsWith(scratch.get(), FREQ)) {
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), FREQ.length, scratch.length()-FREQ.length);
      totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
    } else if (StringHelper.startsWith(scratch.get(), TERM)) {
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
            outputsInner.newPair((long) docFreq, totalTermFreq)));
      }
      lastDocsStart = in.getFilePointer();
      final int len = scratch.length() - TERM.length;
      lastTerm.grow(len);
      System.arraycopy(scratch.bytes(), TERM.length, lastTerm.bytes(), 0, len);
      lastTerm.setLength(len);
      docFreq = 0;
      sumTotalTermFreq += totalTermFreq;
      totalTermFreq = 0;
      termCount++;
    }
  }
  docCount = visitedDocs.cardinality();
  fst = fstCompiler.compile();
  /*
  PrintStream ps = new PrintStream("out.dot");
  fst.toDot(ps);
  ps.close();
  System.out.println("SAVED out.dot");
  */
  //System.out.println("FST " + fst.sizeInBytes());
}
 
Example 14
Source Project: lucene-solr   Source File: FSTDictionary.java    License: Apache License 2.0 4 votes vote down vote up
public Builder() {
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  scratchInts = new IntsRefBuilder();
}
 
Example 15
Source Project: lucene-solr   Source File: WFSTCompletionLookup.java    License: Apache License 2.0 4 votes vote down vote up
@Override
public boolean load(DataInput input) throws IOException {
  count = input.readVLong();
  this.fst = new FST<>(input, input, PositiveIntOutputs.getSingleton());
  return true;
}
 
Example 16
Source Project: lucene-solr   Source File: UserDictionary.java    License: Apache License 2.0 4 votes vote down vote up
private UserDictionary(List<String[]> featureEntries) throws IOException {

    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    Collections.sort(featureEntries, new Comparator<String[]>() {
      @Override
      public int compare(String[] left, String[] right) {
        return left[0].compareTo(right[0]);
     }
    });
    
    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());
    
    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;
    
    for (String[] values : featureEntries) {
      String surface = values[0].replaceAll("\\s", "");
      String concatenatedSegment = values[1].replaceAll("\\s", "");
      String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
      String[] readings = values[2].replaceAll("  *", " ").split(" ");
      String pos = values[3];
      
      if (segmentation.length != readings.length) {
        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
                                   " - the number of segmentations (" + segmentation.length + ")" +
                                   " does not the match number of readings (" + readings.length + ")");
      }

      if (!surface.equals(concatenatedSegment)) {
        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
                                   " - the concatenated segmentation (" + concatenatedSegment + ")" +
                                   " does not match the surface form (" + surface + ")");
      }
      
      int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
      wordIdAndLength[0] = wordId;
      for (int i = 0; i < segmentation.length; i++) {
        wordIdAndLength[i + 1] = segmentation[i].length();
        data.add(readings[i] + INTERNAL_SEPARATOR + pos);
        wordId++;
      }
      // add mapping to FST
      String token = values[0];
      scratch.grow(token.length());
      scratch.setLength(token.length());
      for (int i = 0; i < token.length(); i++) {
        scratch.setIntAt(i, (int) token.charAt(i));
      }
      fstCompiler.add(scratch.get(), ord);
      segmentations.add(wordIdAndLength);
      ord++;
    }
    this.fst = new TokenInfoFST(fstCompiler.compile(), false);
    this.data = data.toArray(new String[data.size()]);
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
  }