Java Code Examples for org.apache.lucene.util.fst.Util

The following examples show how to use org.apache.lucene.util.fst.Util. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
public static FST<?> getFST(SortedSet<String> values) throws IOException {
    final IntsRefBuilder irBuilder = new IntsRefBuilder();
    // The builder options with defaults
    FST.INPUT_TYPE inputType = FST.INPUT_TYPE.BYTE1;
    int minSuffixCount1 = 0;
    int minSuffixCount2 = 0;
    boolean doShareSuffix = true;
    boolean doShareNonSingletonNodes = true;
    int shareMaxTailLength = Integer.MAX_VALUE;
    
    boolean allowArrayArcs = true;
    int bytesPageBits = 15;
    final Outputs<Object> outputs = NoOutputs.getSingleton();
    
    // create the FST from the values
    org.apache.lucene.util.fst.Builder<Object> fstBuilder = new org.apache.lucene.util.fst.Builder<>(inputType, minSuffixCount1, minSuffixCount2,
                    doShareSuffix, doShareNonSingletonNodes, shareMaxTailLength, outputs, allowArrayArcs, bytesPageBits);
    
    for (String value : values) {
        Util.toUTF16(value, irBuilder);
        final IntsRef scratchInt = irBuilder.get();
        fstBuilder.add(scratchInt, outputs.getNoOutput());
    }
    return fstBuilder.finish();
}
 
Example 2
Source Project: ambiverse-nlu   Source File: TrieBuilder.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds an FST that maps each string to a running counter value.
 *
 * <p>The counter starts at 0 and is advanced for every string whose key was
 * converted successfully, even when the subsequent {@code add} call raises an
 * {@link AssertionError}; such entries are logged at debug level and skipped.
 *
 * @param sortedStrings the strings to add, in iteration order
 * @return the finished FST
 * @throws IOException if adding an entry or finishing the FST fails
 */
public static FST<Long> buildTrie(Set<String> sortedStrings) throws IOException {
  final Builder<Long> trieBuilder =
      new Builder<Long>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  final IntsRefBuilder intsScratch = new IntsRefBuilder();
  long nextOutput = 0;
  for (final String mention : sortedStrings) {
    utf8Scratch.copyChars(mention);
    try {
      trieBuilder.add(Util.toIntsRef(utf8Scratch.get(), intsScratch), nextOutput++);
    } catch (java.lang.AssertionError ae) {
      // Best effort: a rejected mention is reported and the build continues.
      logger.debug("Assertion error for mention " + mention);
    }
  }
  return trieBuilder.finish();
}
 
Example 3
Source Project: Elasticsearch   Source File: XAnalyzingSuggester.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Adds every collected surface form for the current analyzed term to the
 * builder, then resets the per-term state.
 *
 * <p>The surface forms are sorted first. The analyzed bytes are extended by
 * trailing bytes, and the last byte is overwritten with a running counter
 * before each add so that every entry gets a distinct input.
 *
 * @param defaultWeight weight to encode for entries whose own weight is -1
 * @throws IOException if adding to the builder fails
 */
public void finishTerm(long defaultWeight) throws IOException {
    ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);

    // Extend the analyzed form; the final byte is rewritten per entry below.
    analyzed.append((byte) 0);
    analyzed.setLength(analyzed.length() + 1);
    analyzed.grow(analyzed.length());

    int dedupCounter = 0;
    for (int idx = 0; idx < count; idx++) {
        analyzed.setByteAt(analyzed.length() - 1, (byte) dedupCounter);
        dedupCounter++;
        Util.toIntsRef(analyzed.get(), scratchInts);

        final SurfaceFormAndPayload candidate = surfaceFormsAndPayload[idx];
        final long cost;
        if (candidate.weight == -1) {
            cost = encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight));
        } else {
            cost = candidate.weight;
        }
        builder.add(scratchInts.get(), outputs.newPair(cost, candidate.payload));
    }

    seenSurfaceForms.clear();
    count = 0;
}
 
Example 4
Source Project: lucene-solr   Source File: NRTSuggesterBuilder.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes all the entries for the FST input term.
 *
 * <p>Each entry is added under the analyzed bytes followed by trailing
 * counter byte(s), so that multiple entries for the same analyzed form get
 * distinct FST inputs. Afterwards {@code maxAnalyzedPathsPerOutput} is raised
 * to the number of entries if that is larger, and {@code entries} is cleared.
 *
 * @throws IOException if adding to the FST compiler fails
 */
public void finishTerm() throws IOException {
  // Counter written into the last byte; restarts whenever a new trailing byte is added.
  int numArcs = 0;
  // Number of trailing counter bytes currently appended to the analyzed form.
  int numDedupBytes = 1;
  // Make room for the first trailing byte.
  analyzed.grow(analyzed.length() + 1);
  analyzed.setLength(analyzed.length() + 1);
  for (Entry entry : entries) {
    if (numArcs == maxNumArcsForDedupByte(numDedupBytes)) {
      // Limit for the current byte reached: store the counter in it,
      // append one more trailing byte and restart counting there.
      analyzed.setByteAt(analyzed.length() - 1, (byte) (numArcs));
      analyzed.grow(analyzed.length() + 1);
      analyzed.setLength(analyzed.length() + 1);
      numArcs = 0;
      numDedupBytes++;
    }
    // Stamp the current counter value into the last byte, then advance it.
    analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
    Util.toIntsRef(analyzed.get(), scratchInts);
    fstCompiler.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
  }
  maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
  entries.clear();
}
 
Example 5
Source Project: lucene-solr   Source File: NormalizeCharMap.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Compiles all pending pairs into a {@link NormalizeCharMap}; call this once
 * you are done calling {@link #add}. The pending pairs are cleared after a
 * successful compile.
 */
public NormalizeCharMap build() {
  try {
    final FSTCompiler<CharsRef> compiler =
        new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, CharSequenceOutputs.getSingleton());
    final IntsRefBuilder keyScratch = new IntsRefBuilder();
    for (final Map.Entry<String,String> pair : pendingPairs.entrySet()) {
      final CharsRef replacement = new CharsRef(pair.getValue());
      compiler.add(Util.toUTF16(pair.getKey(), keyScratch), replacement);
    }
    final FST<CharsRef> compiled = compiler.compile();
    pendingPairs.clear();
    return new NormalizeCharMap(compiled);
  } catch (IOException ioe) {
    // Bogus FST IOExceptions!!  (will never happen)
    throw new RuntimeException(ioe);
  }
}
 
Example 6
Source Project: lucene-solr   Source File: Dictionary.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads {@code num} conversion lines from {@code reader} and compiles them
 * into an FST from the second field of each line to its third field.
 *
 * <p>Every line is split on whitespace and must consist of exactly three
 * fields. Keys are collected in a sorted map before being added to the FST.
 *
 * @param reader source of the conversion lines
 * @param num number of lines to read
 * @return the compiled FST
 * @throws IOException if reading from {@code reader} or compiling the FST fails
 * @throws ParseException if a line does not have exactly three fields, or the
 *         input ends before {@code num} lines were read
 * @throws IllegalStateException if the same key is mapped more than once
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  final Map<String,String> mappings = new TreeMap<>();

  for (int i = 0; i < num; i++) {
    final String line = reader.readLine();
    if (line == null) {
      // readLine() returns null at end of input; report it instead of failing with an NPE.
      throw new ParseException("unexpected end of input: expected " + num
          + " conversion lines but found " + i, reader.getLineNumber());
    }
    final String[] parts = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }

  final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
  final IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }

  return fstCompiler.compile();
}
 
Example 7
Source Project: lucene-solr   Source File: BooleanPerceptronClassifier.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Classifies {@code text} by summing the FST outputs of its analyzed tokens
 * and comparing the sum against {@code bias}.
 *
 * <p>Tokens for which the FST has no entry contribute nothing to the sum.
 *
 * @param text the text to classify
 * @return a result whose class is {@code true} when the sum is at least
 *         {@code bias}, with a score derived from the distance to {@code bias}
 * @throws IOException if analysis or the FST lookup fails
 */
@Override
public ClassificationResult<Boolean> assignClass(String text)
        throws IOException {
  // Primitive accumulator: avoids re-boxing a Long for every matching token.
  long output = 0L;
  try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
    CharTermAttribute charTermAttribute = tokenStream
            .addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      String s = charTermAttribute.toString();
      // Util.get yields null when the term is absent, so the lookup result stays boxed.
      Long d = Util.get(fst, new BytesRef(s));
      if (d != null) {
        output += d;
      }
    }
    tokenStream.end();
  }

  double score = 1 - Math.exp(-1 * Math.abs(bias - (double) output) / bias);
  return new ClassificationResult<>(output >= bias, score);
}
 
Example 8
Source Project: lucene-solr   Source File: FiniteStringsIteratorTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Enumerates the union of two random 50000-char strings and checks that
 * exactly those two strings come back.
 */
public void testFiniteStringsEatsStack() {
  final char[] buffer = new char[50000];
  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, buffer.length);
  final String first = new String(buffer);
  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, buffer.length);
  final String second = new String(buffer);

  final Automaton union = Operations.union(Automata.makeString(first), Automata.makeString(second));
  final FiniteStringsIterator iterator = new FiniteStringsIterator(union);
  final List<IntsRef> actual = getFiniteStrings(iterator);
  assertEquals(2, actual.size());

  // The same scratch builder is refilled for each expected string.
  final IntsRefBuilder scratch = new IntsRefBuilder();
  for (final String expected : new String[] {first, second}) {
    Util.toUTF32(expected.toCharArray(), 0, expected.length(), scratch);
    assertTrue(actual.contains(scratch.get()));
  }
}
 
Example 9
Source Project: datawave   Source File: DatawaveArithmetic.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reports whether the FST has an entry for {@code object.toString()}.
 *
 * <p>The lookup itself runs while holding the monitor of {@code fst}.
 *
 * @param object value whose string form is looked up
 * @param fst the FST to search
 * @return {@code true} if the lookup returned a non-null output
 * @throws IOException if the FST lookup fails
 */
public static boolean matchesFst(Object object, FST fst) throws IOException {
    final IntsRef key = Util.toUTF16(object.toString(), new IntsRefBuilder());
    synchronized (fst) {
        final Object hit = Util.get(fst, key);
        return hit != null;
    }
}
 
Example 10
Source Project: Elasticsearch   Source File: CompletionTokenStream.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Emits the next finite string of the wrapped input as a token.
 *
 * <p>On the first call the input is expanded into its set of finite strings;
 * an {@link IllegalArgumentException} is thrown if there are more than
 * {@code MAX_PATHS} of them.
 *
 * @return {@code true} if a token was produced, {@code false} when exhausted
 * @throws IOException if expanding the input fails
 */
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();

    if (finiteStrings == null) {
        // Lazy initialisation on the first call.
        final Set<IntsRef> paths = toFiniteStrings.toFiniteStrings(input);
        final int pathCount = paths.size();
        if (pathCount > MAX_PATHS) {
            throw new IllegalArgumentException("TokenStream expanded to " + pathCount + " finite strings. Only <= " + MAX_PATHS
                    + " finite strings are supported");
        }
        posInc = pathCount;
        finiteStrings = paths.iterator();
    }

    if (!finiteStrings.hasNext()) {
        return false;
    }

    /*
     * posInc encodes the number of paths that this surface form produced;
     * it is reported on the first token only and is zero afterwards.
     * Multi Fields have the same surface form and therefore sum up.
     */
    posAttr.setPositionIncrement(posInc);
    posInc = 0;

    Util.toBytesRef(finiteStrings.next(), bytesAtt.builder()); // now we have UTF-8
    if (charTermAttribute != null) {
        charTermAttribute.setLength(0);
        charTermAttribute.append(bytesAtt.toUTF16());
    }
    if (payload != null) {
        payloadAttr.setPayload(this.payload);
    }
    return true;
}
 
Example 11
Source Project: lucene-solr   Source File: FSTTermsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Records one term in the FST together with its statistics and encoded
 * postings metadata, and bumps the term counter.
 *
 * @param text the term bytes, used as the FST input
 * @param state the term state supplying docFreq / totalTermFreq and the
 *        data the postings writer encodes
 * @throws IOException if encoding the term or adding it to the FST fails
 */
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
  final FSTTermOutputs.TermData termData = new FSTTermOutputs.TermData();
  termData.bytes = null;
  termData.docFreq = state.docFreq;
  termData.totalTermFreq = state.totalTermFreq;

  // Let the postings writer serialise its metadata into the shared buffer.
  postingsWriter.encodeTerm(metaWriter, fieldInfo, state, true);
  final boolean wroteMetadata = metaWriter.size() > 0;
  if (wroteMetadata) {
    // Copy the bytes out, then empty the buffer for the next term.
    termData.bytes = metaWriter.toArrayCopy();
    metaWriter.reset();
  }

  fstCompiler.add(Util.toIntsRef(text, scratchTerm), termData);
  numTerms++;
}
 
Example 12
Source Project: lucene-solr   Source File: FSTTermsReader.java    License: Apache License 2.0 5 votes vote down vote up
/** Load frame for target arc(node) on fst, so that
 *  arc.label &gt;= label and !fsa.reject(arc.label).
 *  Returns null when no ceiling arc exists; falls through to
 *  {@code loadNextFrame} when the automaton rejects the arc's label. */
Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException {
  // readCeilArc is handed frame.fstArc as the arc to fill.
  final FST.Arc<FSTTermOutputs.TermData> ceilArc =
      Util.readCeilArc(label, fst, top.fstArc, frame.fstArc, fstReader);
  if (ceilArc == null) {
    return null;
  }

  final int nextState = fsa.step(top.fsaState, ceilArc.label());
  frame.fsaState = nextState;
  if (nextState == -1) {
    return loadNextFrame(top, frame);
  }

  frame.output = frame.fstArc.output();
  return frame;
}
 
Example 13
Source Project: lucene-solr   Source File: OrdsBlockTreeTermsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Copies every entry of {@code subIndex} into {@code fstCompiler}, rewriting
 * each output's ordinals relative to {@code termOrdOffset}.
 *
 * @param fstCompiler destination compiler
 * @param subIndex FST whose entries are copied
 * @param termOrdOffset offset added to each start ordinal and subtracted
 *        from each end ordinal
 * @param scratchIntsRef reusable scratch for converting the input bytes
 * @throws IOException if enumerating {@code subIndex} or adding fails
 */
private void append(FSTCompiler<Output> fstCompiler, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<Output> entries = new BytesRefFSTEnum<>(subIndex);
  for (BytesRefFSTEnum.InputOutput<Output> entry = entries.next(); entry != null; entry = entries.next()) {
    final Output original = entry.output;
    final long shiftedStartOrd = termOrdOffset + original.startOrd;
    final long shiftedEndOrd = original.endOrd - termOrdOffset;
    final Output shifted = FST_OUTPUTS.newOutput(original.bytes, shiftedStartOrd, shiftedEndOrd);
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), shifted);
  }
}
 
Example 14
Source Project: lucene-solr   Source File: ContextQuery.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Registers a context with its boost. Pass {@code exact = false}
 * when the context is a prefix of any indexed contexts.
 *
 * @throws IllegalArgumentException if {@code boost} is negative or the
 *         context contains the reserved context separator character
 */
public void addContext(CharSequence context, float boost, boolean exact) {
  if (boost < 0f) {
    throw new IllegalArgumentException("'boost' must be >= 0");
  }

  // Reject any occurrence of the reserved separator.
  for (int pos = 0; pos < context.length(); pos++) {
    final char current = context.charAt(pos);
    if (current == ContextSuggestField.CONTEXT_SEPARATOR) {
      throw new IllegalArgumentException("Illegal value [" + context + "] UTF-16 codepoint [0x"
          + Integer.toHexString((int) current)+ "] at position " + pos + " is a reserved character");
    }
  }

  // The scratch builder is reused, so the key must be deep-copied before storing.
  final IntsRef key = IntsRef.deepCopyOf(Util.toIntsRef(new BytesRef(context), scratch));
  contexts.put(key, new ContextMetaData(boost, exact));
  updateRamBytesUsed();
}
 
Example 15
Source Project: lucene-solr   Source File: ContextQuery.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Renders the contexts as {@code contexts:[a,b*^2.0],} followed by the inner
 * query's string form. Non-exact contexts are suffixed with {@code *} and a
 * non-zero boost is appended after {@code ^}. With no contexts, only the
 * inner query is rendered.
 */
@Override
public String toString(String field) {
  final StringBuilder rendered = new StringBuilder();
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();

  for (final Map.Entry<IntsRef, ContextMetaData> context : contexts.entrySet()) {
    if (rendered.length() == 0) {
      rendered.append("contexts").append(":[");
    } else {
      rendered.append(",");
    }
    rendered.append(Util.toBytesRef(context.getKey(), utf8Scratch).utf8ToString());

    final ContextMetaData meta = context.getValue();
    if (!meta.exact) {
      rendered.append("*");
    }
    if (meta.boost != 0) {
      rendered.append("^").append(Float.toString(meta.boost));
    }
  }

  if (rendered.length() != 0) {
    rendered.append("]").append(",");
  }
  return rendered.toString() + innerQuery.toString(field);
}
 
Example 16
Source Project: lucene-solr   Source File: VersionBlockTreeTermsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Copies every input/output pair of {@code subIndex} into {@code fstCompiler}
 * unchanged.
 *
 * @param fstCompiler destination compiler
 * @param subIndex FST whose entries are copied
 * @param scratchIntsRef reusable scratch for converting the input bytes
 * @throws IOException if enumerating {@code subIndex} or adding fails
 */
private void append(FSTCompiler<Pair<BytesRef,Long>> fstCompiler, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<Pair<BytesRef,Long>> entries = new BytesRefFSTEnum<>(subIndex);
  for (BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> entry = entries.next();
       entry != null;
       entry = entries.next()) {
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), entry.output);
  }
}
 
Example 17
Source Project: lucene-solr   Source File: Dictionary.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Compiles the affix map into an FST from each key (as UTF-32 code points)
 * to the sequence of integers stored for it.
 *
 * @param affixes sorted map from affix string to its list of integers
 * @return the compiled FST
 * @throws IOException if adding an entry or compiling fails
 */
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
  final FSTCompiler<IntsRef> compiler =
      new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
  final IntsRefBuilder keyScratch = new IntsRefBuilder();

  for (final Map.Entry<String,List<Integer>> affix : affixes.entrySet()) {
    Util.toUTF32(affix.getKey(), keyScratch);

    // Copy the boxed list into a freshly sized IntsRef.
    final List<Integer> ids = affix.getValue();
    final IntsRef packed = new IntsRef(ids.size());
    for (final int id : ids) {
      packed.ints[packed.length] = id;
      packed.length++;
    }
    compiler.add(keyScratch.get(), packed);
  }
  return compiler.compile();
}
 
Example 18
Source Project: lucene-solr   Source File: ConcatenateGraphFilter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Emits the next finite string of the input graph as a single token.
 *
 * <p>The automaton is built lazily on the first call after {@code reset()}.
 * Every token after the first gets a position increment of 0, and all
 * tokens share the offsets {@code (0, endOffset)}.
 *
 * @return {@code true} if a token was produced, {@code false} when exhausted
 * @throws IllegalStateException if called before {@code reset()}
 * @throws IOException if consuming the input stream fails
 */
@Override
public boolean incrementToken() throws IOException {
  if (finiteStrings == null) {
    if (!wasReset) {
      throw new IllegalStateException("reset() missing before incrementToken");
    }
    // lazy init/consume: toAutomaton() calls reset(), incrementToken() repeatedly, and end() on inputTokenStream
    final Automaton graph = toAutomaton();
    finiteStrings = new LimitedFiniteStringsIterator(graph, maxGraphExpansions);
    // note: would be nice to know the startOffset but toAutomaton doesn't capture it.  We'll assume 0
    endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset();
  }

  final IntsRef path = finiteStrings.next();
  if (path == null) {
    return false;
  }

  clearAttributes();

  // More than one string iterated so far means this token is stacked on the first.
  final boolean stacked = finiteStrings.size() > 1;
  if (stacked) {
    posIncrAtt.setPositionIncrement(0);
  }

  offsetAtt.setOffset(0, endOffset);

  Util.toBytesRef(path, bytesAtt.builder()); // now we have UTF-8
  if (charTermAttribute != null) {
    charTermAttribute.setLength(0);
    charTermAttribute.append(bytesAtt.toUTF16());
  }

  return true;
}
 
Example 19
Source Project: lucene-solr   Source File: BaseSynonymParserTestCase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that the synonym map holds exactly the expected entry for a word.
 *
 * @param synonynMap  the generated synonym map after parsing
 * @param word        word (phrase) whose synonyms are validated; should be the value that comes out of the
 *                    analyzer. All spaces are replaced by word separators before the lookup.
 * @param includeOrig whether the entry is expected to include the original
 * @param synonyms    expected synonyms. Word separators in the stored synonyms are replaced with a single
 *                    space before comparison.
 */
public static void assertEntryEquals(SynonymMap synonynMap, String word, boolean includeOrig, String[] synonyms)
    throws Exception {
  word = word.replace(' ', SynonymMap.WORD_SEPARATOR);
  final IntsRef lookupKey = Util.toUTF32(new CharsRef(word), new IntsRefBuilder());
  final BytesRef encoded = Util.get(synonynMap.fst, lookupKey);
  assertNotNull("No synonyms found for: " + word, encoded);

  // Header vint: low bit is the keep-original flag, the remaining bits are the synonym count.
  final ByteArrayDataInput in = new ByteArrayDataInput(encoded.bytes, encoded.offset, encoded.length);
  final int header = in.readVInt();

  final boolean keepOrig = (header & 0x1) == 0;
  assertEquals("Include original different than expected. Expected " + includeOrig + " was " + keepOrig,
      includeOrig, keepOrig);

  final int count = header >>> 1;
  assertEquals("Invalid synonym count. Expected " + synonyms.length + " was " + count,
      synonyms.length, count);

  final Set<String> expectedSynonyms = new HashSet<>(Arrays.asList(synonyms));
  final BytesRef wordScratch = new BytesRef();
  for (int remaining = count; remaining > 0; remaining--) {
    final int wordId = in.readVInt();
    synonynMap.words.get(wordId, wordScratch);
    final String synonym = wordScratch.utf8ToString().replace(SynonymMap.WORD_SEPARATOR, ' ');
    assertTrue("Unexpected synonym found: " + synonym, expectedSynonyms.contains(synonym));
  }
}
 
Example 20
Source Project: lucene-solr   Source File: BooleanPerceptronClassifier.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Adjusts the weights of every term in a document's term vector and
 * optionally rebuilds the FST from the resulting weights.
 *
 * <p>For each term, when {@code assignedClass} is non-null, the new weight is
 * 0 if the FST has no entry for the term, otherwise the previous value plus
 * {@code modifier * termFreq}, floored at 0.
 *
 * @throws IOException if the document has no stored term vector for the
 *         text field, or reading the index fails
 */
private void updateWeights(IndexReader indexReader,
                           int docId, Boolean assignedClass, SortedMap<String, Double> weights,
                           double modifier, boolean updateFST) throws IOException {
  final TermsEnum cte = textTerms.iterator();

  // get the doc term vectors
  final Terms docTerms = indexReader.getTermVector(docId, textFieldName);
  if (docTerms == null) {
    throw new IOException("term vectors must be stored for field "
            + textFieldName);
  }

  final TermsEnum docTermsEnum = docTerms.iterator();
  for (BytesRef term = docTermsEnum.next(); term != null; term = docTermsEnum.next()) {
    cte.seekExact(term);
    if (assignedClass == null) {
      continue;
    }
    final long termFreqLocal = docTermsEnum.totalTermFreq();
    // update weights
    final Long previousValue = Util.get(fst, term);
    final String termString = term.utf8ToString();
    final double updatedWeight =
        previousValue == null ? 0 : Math.max(0, previousValue + modifier * termFreqLocal);
    weights.put(termString, updatedWeight);
  }

  if (updateFST) {
    updateFST(weights);
  }
}
 
Example 21
Source Project: lucene-solr   Source File: BooleanPerceptronClassifier.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Rebuilds {@code fst} from the given weights, mapping each term's UTF-8
 * bytes to its weight truncated to a long.
 *
 * @param weights term-to-weight map, iterated in its sorted order
 * @throws IOException if adding an entry or compiling fails
 */
private void updateFST(SortedMap<String, Double> weights) throws IOException {
  final FSTCompiler<Long> compiler =
      new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  final IntsRefBuilder intsScratch = new IntsRefBuilder();

  for (final Map.Entry<String, Double> weighted : weights.entrySet()) {
    utf8Scratch.copyChars(weighted.getKey());
    final IntsRef key = Util.toIntsRef(utf8Scratch.get(), intsScratch);
    final long weight = weighted.getValue().longValue();
    compiler.add(key, weight);
  }
  fst = compiler.compile();
}
 
Example 22
Source Project: lucene-solr   Source File: BaseTokenStreamTestCase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns all paths accepted by the token stream graph produced by the already initialized
 * {@link TokenStream}, with position separators rendered as spaces.
 */
public static Set<String> getGraphStrings(TokenStream tokenStream) throws IOException {
  final Automaton graph = new TokenStreamToAutomaton().toAutomaton(tokenStream);
  final Set<IntsRef> acceptedPaths = AutomatonTestUtil.getFiniteStringsRecursive(graph, -1);

  final char positionSeparator = (char) TokenStreamToAutomaton.POS_SEP;
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  final Set<String> rendered = new HashSet<>();
  for (final IntsRef path : acceptedPaths) {
    final String text = Util.toBytesRef(path, utf8Scratch).utf8ToString();
    rendered.add(text.replace(positionSeparator, ' '));
  }
  return rendered;
}
 
Example 23
Source Project: lucene-solr   Source File: BlockTreeTermsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Copies every input/output pair of {@code subIndex} into {@code fstCompiler}
 * unchanged.
 *
 * @param fstCompiler destination compiler
 * @param subIndex FST whose entries are copied
 * @param scratchIntsRef reusable scratch for converting the input bytes
 * @throws IOException if enumerating {@code subIndex} or adding fails
 */
private void append(FSTCompiler<BytesRef> fstCompiler, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<BytesRef> entries = new BytesRefFSTEnum<>(subIndex);
  for (BytesRefFSTEnum.InputOutput<BytesRef> entry = entries.next();
       entry != null;
       entry = entries.next()) {
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), entry.output);
  }
}
 
Example 24
Source Project: lucene-solr   Source File: TestGraphTokenizers.java    License: Apache License 2.0 5 votes vote down vote up
/** Returns all finite strings of the automaton, with position separators rendered as spaces. */
private Set<String> toPathStrings(Automaton a) {
  final char positionSeparator = (char) TokenStreamToAutomaton.POS_SEP;
  final BytesRefBuilder utf8Scratch = new BytesRefBuilder();
  final Set<String> rendered = new HashSet<>();
  for (final IntsRef path : AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
    final String text = Util.toBytesRef(path, utf8Scratch).utf8ToString();
    rendered.add(text.replace(positionSeparator, ' '));
  }
  return rendered;
}
 
Example 25
Source Project: lucene-solr   Source File: FiniteStringsIteratorTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Basic test for getFiniteStrings: the minimized union of "dog" and "duck"
 * must enumerate exactly those two strings.
 */
public void testFiniteStringsBasic() {
  final Automaton union = Operations.union(Automata.makeString("dog"), Automata.makeString("duck"));
  final Automaton minimized = MinimizationOperations.minimize(union, DEFAULT_MAX_DETERMINIZED_STATES);
  final FiniteStringsIterator iterator = new FiniteStringsIterator(minimized);
  final List<IntsRef> actual = getFiniteStrings(iterator);
  assertFiniteStringsRecursive(minimized, actual);
  assertEquals(2, actual.size());

  for (final String expected : new String[] {"dog", "duck"}) {
    final IntsRefBuilder ints = new IntsRefBuilder();
    Util.toIntsRef(new BytesRef(expected), ints);
    assertTrue(actual.contains(ints.get()));
  }
}
 
Example 26
Source Project: lucene-solr   Source File: FiniteStringsIteratorTest.java    License: Apache License 2.0 5 votes vote down vote up
/** A single-string automaton must enumerate exactly that one string. */
public void testSingletonNoLimit() {
  final String expected = "foobar";
  final FiniteStringsIterator iterator = new FiniteStringsIterator(Automata.makeString(expected));
  final List<IntsRef> actual = getFiniteStrings(iterator);
  assertEquals(1, actual.size());

  final char[] expectedChars = expected.toCharArray();
  final IntsRefBuilder scratch = new IntsRefBuilder();
  Util.toUTF32(expectedChars, 0, expectedChars.length, scratch);
  assertTrue(actual.contains(scratch.get()));
}
 
Example 27
Source Project: lucene-solr   Source File: FiniteStringsIteratorTest.java    License: Apache License 2.0 5 votes vote down vote up
/** Both "x" and its extension "xy" must be enumerated from their minimized union. */
public void testShortAccept() {
  final Automaton union = Operations.union(Automata.makeString("x"), Automata.makeString("xy"));
  final Automaton minimized = MinimizationOperations.minimize(union, DEFAULT_MAX_DETERMINIZED_STATES);
  final FiniteStringsIterator iterator = new FiniteStringsIterator(minimized);
  final List<IntsRef> actual = getFiniteStrings(iterator);
  assertEquals(2, actual.size());

  for (final String expected : new String[] {"x", "xy"}) {
    final IntsRefBuilder ints = new IntsRefBuilder();
    Util.toIntsRef(new BytesRef(expected), ints);
    assertTrue(actual.contains(ints.get()));
  }
}
 
Example 28
Source Project: lucene-solr   Source File: TestAutomaton.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that the determinized automaton accepts exactly the given strings.
 *
 * @param a the automaton under test
 * @param strings the complete set of strings it is expected to accept
 */
private void assertMatches(Automaton a, String... strings) {
  final Set<IntsRef> expected = new HashSet<>();
  for (final String accepted : strings) {
    // A fresh builder per string: each result is stored in the set, so it must not be overwritten.
    final IntsRef codePoints = Util.toUTF32(accepted, new IntsRefBuilder());
    expected.add(codePoints);
  }

  final Automaton determinized = Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
  assertEquals(expected, TestOperations.getFiniteStrings(determinized));
}
 
Example 29
Source Project: lucene-solr   Source File: TestAutomaton.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds random binary intervals and checks, for 500 random terms each, that
 * the automaton accepts a term exactly when it lies inside the interval
 * (respecting the inclusive flags at either end).
 */
public void testMakeBinaryIntervalRandom() throws Exception {
  final int iters = atLeast(100);
  for (int iter = 0; iter < iters; iter++) {
    final BytesRef minTerm = TestUtil.randomBinaryTerm(random());
    final boolean minInclusive = random().nextBoolean();
    final BytesRef maxTerm = TestUtil.randomBinaryTerm(random());
    final boolean maxInclusive = random().nextBoolean();

    final Automaton interval = makeBinaryInterval(minTerm, minInclusive, maxTerm, maxInclusive);

    for (int probe = 0; probe < 500; probe++) {
      final BytesRef term = TestUtil.randomBinaryTerm(random());
      final int minCmp = minTerm.compareTo(term);
      final int maxCmp = maxTerm.compareTo(term);

      // Inside [min, max], and an endpoint hit only counts when that end is inclusive.
      final boolean withinBounds = minCmp <= 0 && maxCmp >= 0;
      final boolean expected = withinBounds
          && (minCmp != 0 || minInclusive)
          && (maxCmp != 0 || maxInclusive);

      if (VERBOSE) {
        System.out.println("  check term=" + term + " expected=" + expected);
      }
      final IntsRefBuilder termInts = new IntsRefBuilder();
      Util.toIntsRef(term, termInts);
      assertEquals(expected, Operations.run(interval, termInts.toIntsRef()));
    }
  }
}
 
Example 30
Source Project: lucene-solr   Source File: TestUTF32ToUTF8.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * For random strings, converting the single-string automaton to UTF-8 must
 * accept exactly the UTF-8 byte sequence of that string.
 */
public void testSingleton() throws Exception {
  final int iters = atLeast(100);
  for (int iter = 0; iter < iters; iter++) {
    final String text = TestUtil.randomRealisticUnicodeString(random());
    final Automaton utf32 = Automata.makeString(text);
    final Automaton utf8 = new UTF32ToUTF8().convert(utf32);

    final IntsRefBuilder utf8Bytes = new IntsRefBuilder();
    Util.toIntsRef(new BytesRef(text), utf8Bytes);
    final Set<IntsRef> expected = new HashSet<>();
    expected.add(utf8Bytes.get());

    assertEquals(expected, TestOperations.getFiniteStrings(utf8));
  }
}