org.apache.lucene.util.fst.Util Java Examples

The following examples show how to use org.apache.lucene.util.fst.Util. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: DatawaveFieldIndexListIteratorJexl.java From datawave with Apache License 2.0

6 votes

/**
 * Compiles the given values into an FST that carries no outputs.
 *
 * <p>Each value is converted with {@code Util.toUTF16} and added to a builder
 * configured for {@code BYTE1} input, in the iteration order of the set.
 *
 * @param values the strings to add
 * @return the FST produced by the builder's {@code finish()}
 * @throws IOException propagated from the FST builder
 */
public static FST<?> getFST(SortedSet<String> values) throws IOException {
    // Builder configuration, spelled out so every knob is visible at the call site
    final FST.INPUT_TYPE labelType = FST.INPUT_TYPE.BYTE1;
    final int minSuffixCount1 = 0;
    final int minSuffixCount2 = 0;
    final boolean shareSuffixes = true;
    final boolean shareNonSingletonNodes = true;
    final int maxSharedTailLength = Integer.MAX_VALUE;
    final boolean arrayArcsAllowed = true;
    final int pageBits = 15;
    final Outputs<Object> noOutputs = NoOutputs.getSingleton();

    final org.apache.lucene.util.fst.Builder<Object> fstBuilder = new org.apache.lucene.util.fst.Builder<>(
                    labelType, minSuffixCount1, minSuffixCount2, shareSuffixes, shareNonSingletonNodes,
                    maxSharedTailLength, noOutputs, arrayArcsAllowed, pageBits);

    // One scratch buffer is reused for every value
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for (String value : values) {
        final IntsRef input = Util.toUTF16(value, scratch);
        fstBuilder.add(input, noOutputs.getNoOutput());
    }
    return fstBuilder.finish();
}

Example #2

Source File: Dictionary.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Reads {@code num} conversion lines from {@code reader} and compiles them into an FST.
 *
 * <p>Each line is split on whitespace and must yield exactly three parts; the second
 * part becomes the key and the third part the value.
 *
 * @param reader source of the conversion lines
 * @param num number of lines to consume
 * @return FST whose inputs are the UTF-16 code units of each key and whose outputs are the values
 * @throws IOException if reading fails, or propagated from the FST compiler
 * @throws ParseException if the input ends before {@code num} lines were read, or a line
 *         does not split into three parts
 * @throws IllegalStateException if the same key is mapped more than once
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  // Sorted map: entries are fed to the FST compiler in key order below
  Map<String,String> mappings = new TreeMap<>();

  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    if (line == null) {
      // readLine() returns null at end of stream; report it instead of failing with an NPE
      throw new ParseException("unexpected end of input: expected " + num + " conversions but found " + i,
          reader.getLineNumber());
    }
    String[] parts = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }

  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }

  return fstCompiler.compile();
}

Example #3

Source File: NormalizeCharMap.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Compiles all pending pairs into an FST and returns a {@link NormalizeCharMap}
 * wrapping it. Call this once you are done calling {@link #add}; the pending
 * pairs are cleared after the FST has been compiled.
 */
public NormalizeCharMap build() {
  final FST<CharsRef> compiled;
  try {
    final Outputs<CharsRef> charOutputs = CharSequenceOutputs.getSingleton();
    final FSTCompiler<CharsRef> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, charOutputs);
    final IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (Map.Entry<String,String> pair : pendingPairs.entrySet()) {
      final CharsRef replacement = new CharsRef(pair.getValue());
      compiler.add(Util.toUTF16(pair.getKey(), scratchInts), replacement);
    }
    compiled = compiler.compile();
    pendingPairs.clear();
  } catch (IOException e) {
    // The FST API declares IOException; it is not expected here, so rethrow unchecked
    throw new RuntimeException(e);
  }
  return new NormalizeCharMap(compiled);
}

Example #4

Source File: BooleanPerceptronClassifier.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Classifies {@code text} by summing the FST outputs of its analyzed tokens and
 * comparing the sum against {@code bias}.
 *
 * <p>Tokens with no entry in the FST contribute nothing to the sum.
 *
 * @param text the text to classify
 * @return a result whose class is {@code sum >= bias} and whose score is
 *         {@code 1 - exp(-|bias - sum| / bias)}
 * @throws IOException propagated from the token stream or the FST lookup
 */
@Override
public ClassificationResult<Boolean> assignClass(String text)
        throws IOException {
  // Primitive accumulator: a boxed Long would be unboxed and re-boxed on every token
  long output = 0L;
  try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
    CharTermAttribute charTermAttribute = tokenStream
            .addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      String s = charTermAttribute.toString();
      // Stays boxed on purpose: null signals "term not present in the FST"
      Long d = Util.get(fst, new BytesRef(s));
      if (d != null) {
        output += d;
      }
    }
    tokenStream.end();
  }

  double score = 1 - Math.exp(-1 * Math.abs(bias - (double) output) / bias);
  return new ClassificationResult<>(output >= bias, score);
}

Example #5

Source File: NRTSuggesterBuilder.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Writes all the entries for the FST input term.
 *
 * <p>A trailing byte is appended to {@code analyzed} and overwritten with a running
 * counter before each entry is added, presumably so that entries sharing the same
 * analyzed form get distinct FST inputs (TODO confirm against the reader side).
 * Whenever the counter reaches {@code maxNumArcsForDedupByte(numDedupBytes)}, the
 * current trailing byte is set to that count, one more byte is appended, and the
 * counter restarts at zero.
 *
 * <p>Afterwards {@code maxAnalyzedPathsPerOutput} is raised to the number of entries
 * if that is larger, and {@code entries} is cleared.
 *
 * @throws IOException propagated from the FST compiler
 */
public void finishTerm() throws IOException {
  // counter written into the trailing byte for the current entry
  int numArcs = 0;
  // how many trailing bytes have been appended so far
  int numDedupBytes = 1;
  // extend the analyzed form by one byte (capacity first, then length)
  analyzed.grow(analyzed.length() + 1);
  analyzed.setLength(analyzed.length() + 1);
  for (Entry entry : entries) {
    if (numArcs == maxNumArcsForDedupByte(numDedupBytes)) {
      // limit for the current trailing byte reached: fix it at numArcs and append another byte
      analyzed.setByteAt(analyzed.length() - 1, (byte) (numArcs));
      analyzed.grow(analyzed.length() + 1);
      analyzed.setLength(analyzed.length() + 1);
      numArcs = 0;
      numDedupBytes++;
    }
    // stamp the current counter value into the last byte, then advance the counter
    analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
    Util.toIntsRef(analyzed.get(), scratchInts);
    fstCompiler.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
  }
  maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
  entries.clear();
}

Example #6

Source File: FiniteStringsIteratorTest.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Builds the union of two random 50000-char strings and checks that iterating
 * its finite strings yields exactly those two strings.
 */
public void testFiniteStringsEatsStack() {
  char[] buffer = new char[50000];
  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, buffer.length);
  String first = new String(buffer);
  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, buffer.length);
  String second = new String(buffer);

  Automaton union = Operations.union(Automata.makeString(first), Automata.makeString(second));
  List<IntsRef> found = getFiniteStrings(new FiniteStringsIterator(union));
  assertEquals(2, found.size());

  // The scratch builder is reused; each toUTF32 call overwrites its content
  IntsRefBuilder scratch = new IntsRefBuilder();
  for (String expected : new String[] {first, second}) {
    Util.toUTF32(expected.toCharArray(), 0, expected.length(), scratch);
    assertTrue(found.contains(scratch.get()));
  }
}

Example #7

Source File: XAnalyzingSuggester.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Adds the first {@code count} collected surface forms for the current analyzed
 * term to the FST builder, then resets the per-term state.
 *
 * <p>The surface forms are sorted first. The last byte of {@code analyzed} is
 * overwritten with the entry's index before each add.
 *
 * @param defaultWeight weight used (capped at {@code Integer.MAX_VALUE} and passed
 *        through {@code encodeWeight}) for entries whose own weight is {@code -1}
 * @throws IOException propagated from the FST builder
 */
public void finishTerm(long defaultWeight) throws IOException {
    ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);
    analyzed.append((byte) 0);
    analyzed.setLength(analyzed.length() + 1);
    analyzed.grow(analyzed.length());
    for (int index = 0; index < count; index++) {
        // the loop index doubles as the de-duplication byte
        analyzed.setByteAt(analyzed.length() - 1, (byte) index);
        Util.toIntsRef(analyzed.get(), scratchInts);
        final SurfaceFormAndPayload candidate = surfaceFormsAndPayload[index];
        final long cost;
        if (candidate.weight == -1) {
            cost = encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight));
        } else {
            cost = candidate.weight;
        }
        builder.add(scratchInts.get(), outputs.newPair(cost, candidate.payload));
    }
    seenSurfaceForms.clear();
    count = 0;
}

Example #8

Source File: TrieBuilder.java From ambiverse-nlu with Apache License 2.0

6 votes

/**
 * Builds an FST over the given strings, giving each string the next value of a
 * counter that starts at 0.
 *
 * <p>An {@link AssertionError} raised while adding a string is logged at debug
 * level and the string is skipped.
 *
 * @param sortedStrings the strings to add, in iteration order
 * @return the FST produced by the builder's {@code finish()}
 * @throws IOException propagated from the FST builder
 */
public static FST<Long> buildTrie(Set<String> sortedStrings) throws IOException {
  final PositiveIntOutputs longOutputs = PositiveIntOutputs.getSingleton();
  final Builder<Long> trieBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, longOutputs);
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  final IntsRefBuilder intsScratch = new IntsRefBuilder();
  long nextOutput = 0;
  for (String mention : sortedStrings) {
    utf8Scratch.copyChars(mention);
    try {
      trieBuilder.add(Util.toIntsRef(utf8Scratch.get(), intsScratch), nextOutput++);
    } catch (AssertionError ae) {
      // best effort: note the offending mention and carry on with the rest
      logger.debug("Assertion error for mention " + mention);
    }
  }
  return trieBuilder.finish();
}

Example #9

Source File: TestAutomaton.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Randomized check of {@code makeBinaryInterval}: for each random interval with
 * random inclusivity, 500 random terms are run through the automaton and the
 * result is compared with a direct comparison against the interval bounds.
 */
public void testMakeBinaryIntervalRandom() throws Exception {
  final int rounds = atLeast(100);
  for (int round = 0; round < rounds; round++) {
    // Random draws happen in this exact order
    final BytesRef minTerm = TestUtil.randomBinaryTerm(random());
    final boolean minInclusive = random().nextBoolean();
    final BytesRef maxTerm = TestUtil.randomBinaryTerm(random());
    final boolean maxInclusive = random().nextBoolean();

    final Automaton interval = makeBinaryInterval(minTerm, minInclusive, maxTerm, maxInclusive);

    for (int probe = 0; probe < 500; probe++) {
      final BytesRef term = TestUtil.randomBinaryTerm(random());
      final int minCmp = minTerm.compareTo(term);
      final int maxCmp = maxTerm.compareTo(term);

      // Accepted iff the term lies within the bounds and any bound it hits is inclusive
      final boolean withinBounds = minCmp <= 0 && maxCmp >= 0;
      final boolean lowerEdgeOk = minCmp != 0 || minInclusive;
      final boolean upperEdgeOk = maxCmp != 0 || maxInclusive;
      final boolean expected = withinBounds && lowerEdgeOk && upperEdgeOk;

      if (VERBOSE) {
        System.out.println("  check term=" + term + " expected=" + expected);
      }
      final IntsRefBuilder termInts = new IntsRefBuilder();
      Util.toIntsRef(term, termInts);
      assertEquals(expected, Operations.run(interval, termInts.toIntsRef()));
    }
  }
}

Example #10

Source File: TestAutomaton.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Asserts that the finite strings of the determinized automaton are exactly the
 * UTF-32 forms of {@code strings}.
 */
private void assertMatches(Automaton a, String... strings) {
  Set<IntsRef> wanted = new HashSet<>();
  for (int i = 0; i < strings.length; i++) {
    // a fresh builder per string, so each IntsRef in the set has its own backing array
    wanted.add(Util.toUTF32(strings[i], new IntsRefBuilder()));
  }

  Automaton deterministic = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
  assertEquals(wanted, TestOperations.getFiniteStrings(deterministic));
}

Example #11

Source File: FiniteStringsIteratorTest.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Checks that the minimized union of "x" and "xy" yields exactly those two
 * finite strings.
 */
public void testShortAccept() {
  Automaton union = Operations.union(Automata.makeString("x"), Automata.makeString("xy"));
  Automaton minimized = MinimizationOperations.minimize(union, DEFAULT_MAX_DETERMINIZED_STATES);
  List<IntsRef> found = getFiniteStrings(new FiniteStringsIterator(minimized));
  assertEquals(2, found.size());

  for (String expected : new String[] {"x", "xy"}) {
    IntsRefBuilder ints = new IntsRefBuilder();
    Util.toIntsRef(new BytesRef(expected), ints);
    assertTrue(found.contains(ints.get()));
  }
}

Example #12

Source File: FiniteStringsIteratorTest.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Checks that a single-string automaton yields exactly that one finite string.
 */
public void testSingletonNoLimit() {
  final String word = "foobar";
  FiniteStringsIterator strings = new FiniteStringsIterator(Automata.makeString(word));
  List<IntsRef> found = getFiniteStrings(strings);
  assertEquals(1, found.size());

  IntsRefBuilder wordInts = new IntsRefBuilder();
  Util.toUTF32(word.toCharArray(), 0, word.length(), wordInts);
  assertTrue(found.contains(wordInts.get()));
}

Example #13

Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0

5 votes

/**
 * For random strings, converts the single-string automaton with
 * {@code UTF32ToUTF8} and checks that its only finite string is the string's
 * {@code BytesRef} bytes.
 */
public void testSingleton() throws Exception {
  final int rounds = atLeast(100);
  for (int round = 0; round < rounds; round++) {
    final String text = TestUtil.randomRealisticUnicodeString(random());
    final Automaton utf8 = new UTF32ToUTF8().convert(Automata.makeString(text));

    final IntsRefBuilder textBytes = new IntsRefBuilder();
    Util.toIntsRef(new BytesRef(text), textBytes);
    final Set<IntsRef> expected = new HashSet<>();
    expected.add(textBytes.get());

    assertEquals(expected, TestOperations.getFiniteStrings(utf8));
  }
}

Example #14

Source File: FiniteStringsIteratorTest.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Basic test for getFiniteStrings: the minimized union of "dog" and "duck"
 * must yield exactly those two strings.
 */
public void testFiniteStringsBasic() {
  Automaton union = Operations.union(Automata.makeString("dog"), Automata.makeString("duck"));
  Automaton minimized = MinimizationOperations.minimize(union, DEFAULT_MAX_DETERMINIZED_STATES);
  List<IntsRef> found = getFiniteStrings(new FiniteStringsIterator(minimized));
  assertFiniteStringsRecursive(minimized, found);
  assertEquals(2, found.size());

  for (String expected : new String[] {"dog", "duck"}) {
    IntsRefBuilder ints = new IntsRefBuilder();
    Util.toIntsRef(new BytesRef(expected), ints);
    assertTrue(found.contains(ints.get()));
  }
}

Example #15

Source File: TestGraphTokenizers.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Returns every finite string of the automaton decoded as UTF-8 text, with
 * {@code TokenStreamToAutomaton.POS_SEP} replaced by a space.
 */
private Set<String> toPathStrings(Automaton a) {
  final char separator = (char) TokenStreamToAutomaton.POS_SEP;
  final BytesRefBuilder scratch = new BytesRefBuilder();
  final Set<String> result = new HashSet<>();
  for (IntsRef path : AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
    String decoded = Util.toBytesRef(path, scratch).utf8ToString();
    result.add(decoded.replace(separator, ' '));
  }
  return result;
}

Example #16

Source File: BlockTreeTermsWriter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Adds every input/output pair enumerated from {@code subIndex} to {@code fstCompiler}.
 *
 * @param fstCompiler compiler that receives the entries
 * @param subIndex FST whose entries are enumerated
 * @param scratchIntsRef reusable scratch buffer for converting each input
 * @throws IOException propagated from the enumeration or the compiler
 */
private void append(FSTCompiler<BytesRef> fstCompiler, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<BytesRef> entries = new BytesRefFSTEnum<>(subIndex);
  // next() returns null once the enumeration is exhausted
  for (BytesRefFSTEnum.InputOutput<BytesRef> entry = entries.next(); entry != null; entry = entries.next()) {
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), entry.output);
  }
}

Example #17

Source File: BaseTokenStreamTestCase.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Returns all paths accepted by the token stream graph produced by the already
 * initialized {@link TokenStream}, decoded as UTF-8 text with
 * {@code TokenStreamToAutomaton.POS_SEP} replaced by a space.
 */
public static Set<String> getGraphStrings(TokenStream tokenStream) throws IOException {
  final Automaton graph = new TokenStreamToAutomaton().toAutomaton(tokenStream);
  final Set<IntsRef> finitePaths = AutomatonTestUtil.getFiniteStringsRecursive(graph, -1);

  final char separator = (char) TokenStreamToAutomaton.POS_SEP;
  final BytesRefBuilder scratch = new BytesRefBuilder();
  final Set<String> result = new HashSet<>();
  for (IntsRef path : finitePaths) {
    String decoded = Util.toBytesRef(path, scratch).utf8ToString();
    result.add(decoded.replace(separator, ' '));
  }
  return result;
}

Example #18

Source File: DatawaveArithmetic.java From datawave with Apache License 2.0

5 votes

/**
 * Tests whether the string form of {@code object} is present in {@code fst}.
 *
 * <p>The lookup key is built from {@code object.toString()} with
 * {@code Util.toUTF16}; the lookup itself runs while holding the monitor of
 * {@code fst}.
 *
 * @param object value whose {@code toString()} is looked up
 * @param fst the FST to search
 * @return {@code true} if the lookup returns a non-null output
 * @throws IOException propagated from the FST lookup
 */
public static boolean matchesFst(Object object, FST fst) throws IOException {
    final IntsRefBuilder keyBuilder = new IntsRefBuilder();
    final IntsRef key = Util.toUTF16(object.toString(), keyBuilder);
    synchronized (fst) {
        final Object hit = Util.get(fst, key);
        return hit != null;
    }
}

Example #19

Source File: BooleanPerceptronClassifier.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Rebuilds {@code fst} from the given weights, storing each weight truncated to
 * a {@code long} under its term.
 *
 * @param weights term-to-weight map, iterated in key order
 * @throws IOException propagated from the FST compiler
 */
private void updateFST(SortedMap<String, Double> weights) throws IOException {
  final PositiveIntOutputs longOutputs = PositiveIntOutputs.getSingleton();
  final FSTCompiler<Long> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, longOutputs);
  final BytesRefBuilder termBytes = new BytesRefBuilder();
  final IntsRefBuilder termInts = new IntsRefBuilder();
  for (Map.Entry<String, Double> weight : weights.entrySet()) {
    termBytes.copyChars(weight.getKey());
    final long truncated = weight.getValue().longValue();
    compiler.add(Util.toIntsRef(termBytes.get(), termInts), truncated);
  }
  fst = compiler.compile();
}

Example #20

Source File: LimitedFiniteStringsIteratorTest.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Checks that a limited iterator with limit 1 over a single-string automaton
 * yields exactly that string.
 */
public void testSingleton() {
  final String word = "foobar";
  Automaton single = Automata.makeString(word);
  List<IntsRef> found = getFiniteStrings(new LimitedFiniteStringsIterator(single, 1));
  assertEquals(1, found.size());

  IntsRefBuilder wordInts = new IntsRefBuilder();
  Util.toUTF32(word.toCharArray(), 0, word.length(), wordInts);
  assertTrue(found.contains(wordInts.get()));
}

Example #21

Source File: BooleanPerceptronClassifier.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Updates {@code weights} from the term vector of one document and optionally
 * rebuilds the FST.
 *
 * <p>For every term of the document's term vector (when {@code assignedClass} is
 * non-null) the new weight is {@code 0} if the term has no FST entry, otherwise
 * {@code max(0, previous + modifier * totalTermFreq)}.
 *
 * @param indexReader reader used to fetch the term vector
 * @param docId document whose term vector is read
 * @param assignedClass when {@code null}, no weights are changed
 * @param weights map updated in place
 * @param modifier factor applied to each term frequency
 * @param updateFST whether to call {@code updateFST(weights)} afterwards
 * @throws IOException if the document has no term vector for the text field, or
 *         propagated from the index/FST calls
 */
private void updateWeights(IndexReader indexReader,
                           int docId, Boolean assignedClass, SortedMap<String, Double> weights,
                           double modifier, boolean updateFST) throws IOException {
  final TermsEnum cte = textTerms.iterator();

  final Terms docTerms = indexReader.getTermVector(docId, textFieldName);
  if (docTerms == null) {
    throw new IOException("term vectors must be stored for field "
            + textFieldName);
  }

  final TermsEnum docTermsEnum = docTerms.iterator();
  for (BytesRef term = docTermsEnum.next(); term != null; term = docTermsEnum.next()) {
    cte.seekExact(term);
    if (assignedClass == null) {
      continue;
    }
    final long termFreqLocal = docTermsEnum.totalTermFreq();
    final Long previous = Util.get(fst, term);
    final String termText = term.utf8ToString();
    final double updated;
    if (previous == null) {
      updated = 0;
    } else {
      updated = Math.max(0, previous + modifier * termFreqLocal);
    }
    weights.put(termText, updated);
  }

  if (updateFST) {
    updateFST(weights);
  }
}

Example #22

Source File: CompletionTokenStream.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Emits the next finite string of the input as a token.
 *
 * <p>On the first call the input is expanded into its finite strings; more than
 * {@code MAX_PATHS} of them is rejected. The first emitted token carries the
 * number of finite strings as its position increment, later ones carry 0.
 *
 * @return {@code true} if a token was produced, {@code false} when exhausted
 * @throws IllegalArgumentException if the input expands to more than {@code MAX_PATHS} strings
 * @throws IOException propagated from the expansion
 */
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    if (finiteStrings == null) {
        final Set<IntsRef> expanded = toFiniteStrings.toFiniteStrings(input);
        final int pathCount = expanded.size();
        if (pathCount > MAX_PATHS) {
            throw new IllegalArgumentException("TokenStream expanded to " + pathCount + " finite strings. Only <= " + MAX_PATHS
                    + " finite strings are supported");
        }
        posInc = pathCount;
        finiteStrings = expanded.iterator();
    }

    if (!finiteStrings.hasNext()) {
        return false;
    }

    /*
     * posInc encodes the number of paths that this surface form produced.
     * Multi Fields have the same surface form and therefore sum up.
     */
    posAttr.setPositionIncrement(posInc);
    posInc = 0;
    Util.toBytesRef(finiteStrings.next(), bytesAtt.builder()); // bytesAtt now holds UTF-8
    if (charTermAttribute != null) {
        charTermAttribute.setLength(0);
        charTermAttribute.append(bytesAtt.toUTF16());
    }
    if (payload != null) {
        payloadAttr.setPayload(this.payload);
    }
    return true;
}

Example #23

Source File: ConcatenateGraphFilter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Emits the next finite string of the input graph as a token.
 *
 * <p>On the first call the input token stream is consumed into an automaton and
 * a limited finite-strings iterator is created over it. Every token after the
 * first gets a position increment of 0, and all tokens get offsets
 * {@code (0, endOffset)}.
 *
 * @return {@code true} if a token was produced, {@code false} when exhausted
 * @throws IllegalStateException if {@code reset()} was not called first
 * @throws IOException propagated from consuming the input
 */
@Override
public boolean incrementToken() throws IOException {
  if (finiteStrings == null) {
    if (!wasReset) {
      throw new IllegalStateException("reset() missing before incrementToken");
    }
    // lazy init: toAutomaton() calls reset(), incrementToken() repeatedly, and end() on inputTokenStream
    finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(), maxGraphExpansions);
    // toAutomaton doesn't capture the startOffset, so 0 is assumed when offsets are set below
    endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset();
  }

  final IntsRef next = finiteStrings.next();
  if (next == null) {
    return false;
  }

  clearAttributes();

  // once more than one string has been iterated, stack the token on the previous one
  final boolean stacked = finiteStrings.size() > 1;
  if (stacked) {
    posIncrAtt.setPositionIncrement(0);
  }

  offsetAtt.setOffset(0, endOffset);

  Util.toBytesRef(next, bytesAtt.builder()); // bytesAtt now holds UTF-8
  if (charTermAttribute != null) {
    charTermAttribute.setLength(0);
    charTermAttribute.append(bytesAtt.toUTF16());
  }

  return true;
}

Example #24

Source File: Dictionary.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Compiles the affix map into an FST keyed by the UTF-32 code points of each
 * key, with the key's integer list as the output sequence.
 *
 * @param affixes sorted map from key to its list of integers
 * @return the compiled FST
 * @throws IOException propagated from the FST compiler
 */
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
  final IntSequenceOutputs sequenceOutputs = IntSequenceOutputs.getSingleton();
  final FSTCompiler<IntsRef> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, sequenceOutputs);
  final IntsRefBuilder keyInts = new IntsRefBuilder();
  for (Map.Entry<String,List<Integer>> affix : affixes.entrySet()) {
    Util.toUTF32(affix.getKey(), keyInts);
    final List<Integer> values = affix.getValue();
    // copy the boxed list into a freshly sized IntsRef
    final IntsRef packed = new IntsRef(values.size());
    for (int value : values) {
      packed.ints[packed.length] = value;
      packed.length++;
    }
    compiler.add(keyInts.get(), packed);
  }
  return compiler.compile();
}

Example #25

Source File: VersionBlockTreeTermsWriter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Adds every input/output pair enumerated from {@code subIndex} to {@code fstCompiler}.
 *
 * @param fstCompiler compiler that receives the entries
 * @param subIndex FST whose entries are enumerated
 * @param scratchIntsRef reusable scratch buffer for converting each input
 * @throws IOException propagated from the enumeration or the compiler
 */
private void append(FSTCompiler<Pair<BytesRef,Long>> fstCompiler, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<Pair<BytesRef,Long>> entries = new BytesRefFSTEnum<>(subIndex);
  // next() returns null once the enumeration is exhausted
  for (BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> entry = entries.next(); entry != null; entry = entries.next()) {
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), entry.output);
  }
}

Example #26

Source File: ContextQuery.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Renders the query as {@code contexts:[c1,c2,...],} followed by the inner
 * query's string; the prefix is omitted when there are no contexts.
 *
 * <p>A context is suffixed with {@code *} when it is not exact and with
 * {@code ^boost} when its boost is non-zero.
 */
@Override
public String toString(String field) {
  final StringBuilder out = new StringBuilder();
  final BytesRefBuilder scratch = new BytesRefBuilder();
  for (Map.Entry<IntsRef, ContextMetaData> context : contexts.entrySet()) {
    if (out.length() == 0) {
      // first context: open the list
      out.append("contexts");
      out.append(":[");
    } else {
      out.append(",");
    }
    out.append(Util.toBytesRef(context.getKey(), scratch).utf8ToString());
    final ContextMetaData meta = context.getValue();
    if (!meta.exact) {
      out.append("*");
    }
    if (meta.boost != 0) {
      out.append("^");
      out.append(Float.toString(meta.boost));
    }
  }
  if (out.length() != 0) {
    // at least one context was written: close the list
    out.append("]");
    out.append(",");
  }
  return out.toString() + innerQuery.toString(field);
}

Example #27

Source File: ContextQuery.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Adds a context with boost; set <code>exact</code> to false
 * if the context is a prefix of any indexed contexts.
 *
 * @param context the context value; must not contain {@code ContextSuggestField.CONTEXT_SEPARATOR}
 * @param boost boost for the context; must be &gt;= 0
 * @param exact whether the context must match exactly
 * @throws IllegalArgumentException if {@code boost} is negative or {@code context}
 *         contains the reserved separator character
 */
public void addContext(CharSequence context, float boost, boolean exact) {
  if (boost < 0f) {
    throw new IllegalArgumentException("'boost' must be >= 0");
  }
  for (int pos = 0; pos < context.length(); pos++) {
    final char current = context.charAt(pos);
    if (current != ContextSuggestField.CONTEXT_SEPARATOR) {
      continue;
    }
    throw new IllegalArgumentException("Illegal value [" + context + "] UTF-16 codepoint [0x"
        + Integer.toHexString((int) current)+ "] at position " + pos + " is a reserved character");
  }
  // deep copy: the shared scratch buffer is overwritten by the next conversion
  final IntsRef key = IntsRef.deepCopyOf(Util.toIntsRef(new BytesRef(context), scratch));
  contexts.put(key, new ContextMetaData(boost, exact));
  updateRamBytesUsed();
}

Example #28

Source File: FSTTermsWriter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Encodes the term's metadata and adds the term to the FST.
 *
 * <p>The postings writer encodes the term state into {@code metaWriter}; if
 * anything was written it is copied into the term data and the writer is reset,
 * otherwise the term data's bytes stay {@code null}.
 *
 * @param text the term
 * @param state the term's state; supplies {@code docFreq} and {@code totalTermFreq}
 * @throws IOException propagated from the postings writer or the FST compiler
 */
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
  final FSTTermOutputs.TermData termData = new FSTTermOutputs.TermData();
  termData.docFreq = state.docFreq;
  termData.totalTermFreq = state.totalTermFreq;

  postingsWriter.encodeTerm(metaWriter, fieldInfo, state, true);
  if (metaWriter.size() > 0) {
    termData.bytes = metaWriter.toArrayCopy();
    metaWriter.reset();
  } else {
    termData.bytes = null;
  }

  fstCompiler.add(Util.toIntsRef(text, scratchTerm), termData);
  numTerms++;
}

Example #29

Source File: BaseSynonymParserTestCase.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Helper method to validate synonym parsing.
 *
 * <p>Looks {@code word} up in the synonym map's FST, decodes the header vInt
 * (low bit clear means the original is kept; the remaining bits are the synonym
 * count) and checks every decoded synonym against {@code synonyms}.
 *
 * @param synonynMap  the generated synonym map after parsing
 * @param word        word (phrase) we are validating the synonyms for. Should be the value that comes out of the analyzer.
 *                    All spaces will be replaced by word separators.
 * @param includeOrig if synonyms should include original
 * @param synonyms    actual synonyms. All word separators are replaced with a single space.
 */
public static void assertEntryEquals(SynonymMap synonynMap, String word, boolean includeOrig, String[] synonyms)
    throws Exception {
  word = word.replace(' ', SynonymMap.WORD_SEPARATOR);
  final IntsRefBuilder keyBuilder = new IntsRefBuilder();
  final BytesRef encoded = Util.get(synonynMap.fst, Util.toUTF32(new CharsRef(word), keyBuilder));
  assertNotNull("No synonyms found for: " + word, encoded);

  final ByteArrayDataInput in = new ByteArrayDataInput(encoded.bytes, encoded.offset, encoded.length);
  final int header = in.readVInt();

  final boolean keepOrig = (header & 0x1) == 0;
  assertEquals("Include original different than expected. Expected " + includeOrig + " was " + keepOrig,
      includeOrig, keepOrig);

  final int count = header >>> 1;
  assertEquals("Invalid synonym count. Expected " + synonyms.length + " was " + count,
      synonyms.length, count);

  final Set<String> allowed = new HashSet<>(Arrays.asList(synonyms));
  final BytesRef scratch = new BytesRef();
  int remaining = count;
  while (remaining > 0) {
    synonynMap.words.get(in.readVInt(), scratch);
    final String synonym = scratch.utf8ToString().replace(SynonymMap.WORD_SEPARATOR, ' ');
    assertTrue("Unexpected synonym found: " + synonym, allowed.contains(synonym));
    remaining--;
  }
}

Example #30

Source File: FSTTermsReader.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Load frame for target arc(node) on fst, so that
 * arc.label &gt;= label and !fsa.reject(arc.label).
 *
 * @param label label passed to {@code Util.readCeilArc} as the ceiling target
 * @param top frame whose {@code fstArc} and {@code fsaState} are the starting point
 * @param frame frame to fill; its {@code fsaState} and {@code output} are written here
 * @return {@code null} if {@code readCeilArc} finds no arc; otherwise {@code frame},
 *         or the result of {@code loadNextFrame(top, frame)} when the automaton
 *         step for the found arc's label yields {@code -1}
 * @throws IOException propagated from the FST read
 */
Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException {
  FST.Arc<FSTTermOutputs.TermData> arc = frame.fstArc;
  // frame.fstArc is handed to readCeilArc as the arc argument; the local is rebound to the result
  arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader);
  if (arc == null) {
    return null;
  }
  frame.fsaState = fsa.step(top.fsaState, arc.label());
  //if (TEST) System.out.println(" loadCeil frame="+frame);
  if (frame.fsaState == -1) {
    // no automaton transition for this label: fall through to loadNextFrame
    return loadNextFrame(top, frame);
  }
  // NOTE(review): output is read via frame.fstArc rather than the local arc — presumably
  // readCeilArc fills and returns the arc instance it was given; confirm.
  frame.output = frame.fstArc.output();
  return frame;
}