Java Code Examples for org.apache.lucene.util.IntsRefBuilder

The following examples show how to use org.apache.lucene.util.IntsRefBuilder. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
/**
 * Compiles the given strings into an FST that carries no outputs.
 *
 * <p>Each value is converted with {@code Util.toUTF16} and added in the iteration order of
 * {@code values}.
 *
 * @param values the strings to add, in the order the set iterates them
 * @return whatever the FST builder's {@code finish()} returns after all values were added
 * @throws IOException if the FST builder reports an I/O failure
 */
public static FST<?> getFST(SortedSet<String> values) throws IOException {
    // Every input maps to the shared "no output" marker.
    final Outputs<Object> outputs = NoOutputs.getSingleton();

    // Builder options, spelled out explicitly.
    final FST.INPUT_TYPE inputType = FST.INPUT_TYPE.BYTE1;
    final int minSuffixCount1 = 0;
    final int minSuffixCount2 = 0;
    final boolean doShareSuffix = true;
    final boolean doShareNonSingletonNodes = true;
    final int shareMaxTailLength = Integer.MAX_VALUE;
    final boolean allowArrayArcs = true;
    final int bytesPageBits = 15;

    final org.apache.lucene.util.fst.Builder<Object> fstBuilder =
            new org.apache.lucene.util.fst.Builder<>(
                    inputType,
                    minSuffixCount1,
                    minSuffixCount2,
                    doShareSuffix,
                    doShareNonSingletonNodes,
                    shareMaxTailLength,
                    outputs,
                    allowArrayArcs,
                    bytesPageBits);

    // One scratch buffer is reused for every value.
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for (String value : values) {
        Util.toUTF16(value, scratch);
        final IntsRef input = scratch.get();
        fstBuilder.add(input, outputs.getNoOutput());
    }

    return fstBuilder.finish();
}
 
Example 2
Source Project: ambiverse-nlu   Source File: TrieBuilder.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds an FST over the given strings, associating each one with a running counter value.
 *
 * <p>An {@link AssertionError} raised while adding a mention is logged at debug level and the
 * loop moves on to the next mention.
 *
 * @param sortedStrings the mentions to add, in iteration order
 * @return the result of the builder's {@code finish()}
 * @throws IOException if the FST builder reports an I/O failure
 */
public static FST<Long> buildTrie(Set<String> sortedStrings) throws IOException {
  final Builder<Long> trie =
      new Builder<Long>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
  final BytesRefBuilder mentionBytes = new BytesRefBuilder();
  final IntsRefBuilder mentionInts = new IntsRefBuilder();

  long nextValue = 0;
  for (String mention : sortedStrings) {
    mentionBytes.copyChars(mention);
    try {
      // The counter advances while the arguments of add() are being evaluated.
      trie.add(Util.toIntsRef(mentionBytes.get(), mentionInts), nextValue++);
    } catch (java.lang.AssertionError ae) {
      logger.debug("Assertion error for mention " + mention);
    }
  }

  return trie.finish();
}
 
Example 3
Source Project: lucene-solr   Source File: FSTCompletionBuilder.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Compiles the entries supplied by the sorter into an output-less automaton.
 *
 * <p>An entry that compares equal to the most recently added one is skipped.
 *
 * @param sorter source of the entries, consumed through its iterator
 * @return the compiled FST, or {@code null} when the sorter yielded no entries at all
 * @throws IOException if iterating the entries or compiling the FST fails
 */
private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
  final Outputs<Object> outputs = NoOutputs.getSingleton();
  final Object noOutput = outputs.getNoOutput();
  final FSTCompiler<Object> compiler =
      new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
          .shareMaxTailLength(shareMaxTailLength)
          .build();

  // Holds a copy of the last entry that was actually added.
  final BytesRefBuilder lastAdded = new BytesRefBuilder();
  final IntsRefBuilder labels = new IntsRefBuilder();
  int seen = 0;

  final BytesRefIterator entries = sorter.iterator();
  for (BytesRef current = entries.next(); current != null; current = entries.next()) {
    seen++;
    if (lastAdded.get().compareTo(current) == 0) {
      continue;
    }
    compiler.add(Util.toIntsRef(current, labels), noOutput);
    lastAdded.copyBytes(current);
  }

  if (seen == 0) {
    return null;
  }
  return compiler.compile();
}
 
Example 4
Source Project: lucene-solr   Source File: TokenInfoDictionaryTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a two-entry dictionary and checks the left id, right id and word cost reported for
 * the first word id returned by each lookup.
 */
public void testPut() throws Exception {
  final TokenInfoDictionary dictionary = newDictionary(
      "名詞,1,1,2,名詞,一般,*,*,*,*,*,*,*",
      // "large" id
      "一般,5000,5000,3,名詞,一般,*,*,*,*,*,*,*");
  final IntsRef ids = new IntsRefBuilder().get();

  // Lookup 0 is expected to resolve to the entry with the large ids.
  dictionary.lookupWordIds(0, ids);
  final int firstWordId = ids.ints[ids.offset];
  assertEquals(5000, dictionary.getLeftId(firstWordId));
  assertEquals(5000, dictionary.getRightId(firstWordId));
  assertEquals(3, dictionary.getWordCost(firstWordId));

  // Lookup 1 is expected to resolve to the entry with the small ids.
  dictionary.lookupWordIds(1, ids);
  final int secondWordId = ids.ints[ids.offset];
  assertEquals(1, dictionary.getLeftId(secondWordId));
  assertEquals(1, dictionary.getRightId(secondWordId));
  assertEquals(2, dictionary.getWordCost(secondWordId));
}
 
Example 5
Source Project: lucene-solr   Source File: TokenInfoDictionaryTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a two-entry dictionary and checks the left id, right id and word cost reported for
 * the first word id returned by each lookup.
 */
public void testPut() throws Exception {
  final TokenInfoDictionary dictionary = newDictionary(
      "명사,1,1,2,NNG,*,*,*,*,*,*,*",
      // "large" id
      "일반,5000,5000,3,NNG,*,*,*,*,*,*,*");
  final IntsRef ids = new IntsRefBuilder().get();

  // Lookup 0 is expected to resolve to the entry with the small ids.
  dictionary.lookupWordIds(0, ids);
  final int firstWordId = ids.ints[ids.offset];
  assertEquals(1, dictionary.getLeftId(firstWordId));
  assertEquals(1, dictionary.getRightId(firstWordId));
  assertEquals(2, dictionary.getWordCost(firstWordId));

  // Lookup 1 is expected to resolve to the entry with the large ids.
  dictionary.lookupWordIds(1, ids);
  final int secondWordId = ids.ints[ids.offset];
  assertEquals(5000, dictionary.getLeftId(secondWordId));
  assertEquals(5000, dictionary.getRightId(secondWordId));
  assertEquals(3, dictionary.getWordCost(secondWordId));
}
 
Example 6
Source Project: lucene-solr   Source File: NormalizeCharMap.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Compiles all pending pairs into an FST and wraps it in a {@link NormalizeCharMap}; call
 * this once you are done calling {@link #add}. The pending pairs are cleared after a
 * successful compile.
 */
public NormalizeCharMap build() {
  try {
    final FSTCompiler<CharsRef> compiler =
        new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, CharSequenceOutputs.getSingleton());
    final IntsRefBuilder keyScratch = new IntsRefBuilder();

    for (Map.Entry<String,String> pair : pendingPairs.entrySet()) {
      final CharsRef replacement = new CharsRef(pair.getValue());
      compiler.add(Util.toUTF16(pair.getKey(), keyScratch), replacement);
    }

    final FST<CharsRef> compiled = compiler.compile();
    pendingPairs.clear();
    return new NormalizeCharMap(compiled);
  } catch (IOException ioe) {
    // Bogus FST IOExceptions!!  (will never happen)
    throw new RuntimeException(ioe);
  }
}
 
Example 7
Source Project: lucene-solr   Source File: Dictionary.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads {@code num} conversion lines from the reader and compiles them into an FST that maps
 * the second whitespace-separated field of each line to the third.
 *
 * @param reader source of the conversion lines; exactly {@code num} lines are consumed
 * @param num number of conversion lines to read
 * @return the compiled FST, keyed by the UTF-16 form of each mapping's source string
 * @throws IOException if reading from {@code reader} or compiling the FST fails
 * @throws ParseException if the input ends before {@code num} lines were read, or a line does
 *     not consist of exactly three whitespace-separated fields
 * @throws IllegalStateException if the same source string is mapped more than once
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  // TreeMap keeps the keys sorted for the insertion loop below.
  Map<String,String> mappings = new TreeMap<>();

  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    if (line == null) {
      // readLine() signals end of stream with null; report it instead of failing with an NPE.
      throw new ParseException("unexpected end of input: expected " + num
          + " conversion lines but found only " + i, reader.getLineNumber());
    }
    String[] parts = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }

  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }

  return fstCompiler.compile();
}
 
Example 8
Source Project: lucene-solr   Source File: StemmerOverrideFilter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Compiles the collected entries into a {@link StemmerOverrideMap} for use with the
 * {@link StemmerOverrideFilter}.
 *
 * <p>Entries are added in the order given by {@code hash.sort()}.
 *
 * @return a {@link StemmerOverrideMap} wrapping the compiled FST and the {@code ignoreCase} flag
 * @throws IOException if compiling the FST fails
 */
public StemmerOverrideMap build() throws IOException {
  final FSTCompiler<BytesRef> compiler =
      new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, ByteSequenceOutputs.getSingleton());

  final int[] sortedIds = hash.sort();
  final int entryCount = hash.size();
  final IntsRefBuilder keyInts = new IntsRefBuilder();
  final BytesRef keyScratch = new BytesRef();

  for (int pos = 0; pos < entryCount; pos++) {
    final int entryId = sortedIds[pos];
    keyInts.copyUTF8Bytes(hash.get(entryId, keyScratch));
    final BytesRef override = new BytesRef(outputValues.get(entryId));
    compiler.add(keyInts.get(), override);
  }

  return new StemmerOverrideMap(compiler.compile(), ignoreCase);
}
 
Example 9
Source Project: lucene-solr   Source File: FiniteStringsIterator.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Constructor.
 *
 * @param a Automaton to create finite string from.
 * @param startState The starting state for each path.
 * @param endState The state where each path should stop or -1 if only accepted states should be final.
 */
public FiniteStringsIterator(Automaton a, int startState, int endState) {
  this.a = a;
  this.endState = endState;
  this.nodes = new PathNode[16];
  for (int i = 0, end = nodes.length; i < end; i++) {
    nodes[i] = new PathNode();
  }
  this.string = new IntsRefBuilder();
  this.pathStates = new BitSet(a.getNumStates());
  this.string.setLength(0);
  this.emitEmptyString = a.isAccept(0);

  // Start iteration with node startState.
  if (a.getNumTransitions(startState) > 0) {
    pathStates.set(startState);
    nodes[0].resetState(a, startState);
    string.append(startState);
  }
}
 
Example 10
Source Project: lucene-solr   Source File: Util.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Offers every arc leaving {@code node} to the queue as a start path.
 *
 * <p>Arcs labelled {@code FST.END_LABEL} are only offered when {@code allowEmptyString} is
 * true.
 */
public void addStartPaths(FST.Arc<T> node, T startOutput, boolean allowEmptyString, IntsRefBuilder input,
                          float boost, CharSequence context, int payload) throws IOException {

  // De-dup NO_OUTPUT since it must be a singleton:
  if (startOutput.equals(fst.outputs.getNoOutput())) {
    startOutput = fst.outputs.getNoOutput();
  }

  final FSTPath<T> candidate = new FSTPath<>(startOutput, node, input, boost, context, payload);
  fst.readFirstTargetArc(node, candidate.arc, bytesReader);

  // Walk the sibling arcs in order, offering each one, until the last arc was handled.
  boolean exhausted = false;
  while (!exhausted) {
    if (allowEmptyString || candidate.arc.label() != FST.END_LABEL) {
      addIfCompetitive(candidate);
    }
    exhausted = candidate.arc.isLast();
    if (!exhausted) {
      fst.readNextArc(candidate.arc, bytesReader);
    }
  }
}
 
Example 11
Source Project: lucene-solr   Source File: FiniteStringsIteratorTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the union of two random 50000-char strings and checks that iterating its finite
 * strings yields exactly those two strings.
 */
public void testFiniteStringsEatsStack() {
  final char[] buffer = new char[50000];

  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, buffer.length);
  final String first = new String(buffer);
  TestUtil.randomFixedLengthUnicodeString(random(), buffer, 0, buffer.length);
  final String second = new String(buffer);

  final Automaton either =
      Operations.union(Automata.makeString(first), Automata.makeString(second));
  final FiniteStringsIterator iterator = new FiniteStringsIterator(either);
  final List<IntsRef> found = getFiniteStrings(iterator);
  assertEquals(2, found.size());

  // Each input, converted to UTF-32, must be among the enumerated strings.
  final IntsRefBuilder codePoints = new IntsRefBuilder();
  for (String expected : new String[] {first, second}) {
    Util.toUTF32(expected.toCharArray(), 0, expected.length(), codePoints);
    assertTrue(found.contains(codePoints.get()));
  }
}
 
Example 12
Source Project: lucene-solr   Source File: TestFSTs.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Adds the same input ten times to an output-less FST and checks that it contains a single
 * path for it.
 */
public void testDuplicateFSAString() throws Exception {
  final String input = "foobar";
  final Outputs<Object> outputs = NoOutputs.getSingleton();
  final FSTCompiler<Object> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

  final IntsRefBuilder scratch = new IntsRefBuilder();
  int remaining = 10;
  while (remaining-- > 0) {
    compiler.add(Util.toIntsRef(new BytesRef(input), scratch), outputs.getNoOutput());
  }
  final FST<Object> fst = compiler.compile();

  // count the input paths
  int pathCount = 0;
  for (BytesRefFSTEnum<Object> paths = new BytesRefFSTEnum<>(fst); paths.next() != null; ) {
    pathCount++;
  }
  assertEquals(1, pathCount);

  assertNotNull(Util.get(fst, new BytesRef(input)));
  assertNull(Util.get(fst, new BytesRef("foobaz")));
}
 
Example 13
Source Project: lucene-solr   Source File: TestFSTs.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Compiles "stat" and "station" into one FST and checks that its dot rendering marks both
 * the {@code t} arc and the {@code n} arc in bold.
 */
public void testInternalFinalState() throws Exception {
  final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  final FSTCompiler<Long> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  compiler.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput());
  compiler.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput());
  final FST<Long> fst = compiler.compile();

  final StringWriter dotWriter = new StringWriter();
  Util.toDot(fst, dotWriter, false, false);
  dotWriter.close();
  final String dot = dotWriter.toString();

  // check for accept state at label t
  assertTrue(dot.contains("[label=\"t\" style=\"bold\""));
  // check for accept state at label n
  assertTrue(dot.contains("[label=\"n\" style=\"bold\""));
}
 
Example 14
Source Project: lucene-solr   Source File: TestFSTs.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Adds six single-label inputs, each with a 300-byte output whose first byte equals the
 * label, and checks that every output is read back intact.
 */
public void testLargeOutputsOnArrayArcs() throws Exception {
  final int arcCount = 6;
  final int outputLength = 300;

  final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
  final FSTCompiler<BytesRef> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);

  // A one-int key whose only slot is overwritten for each arc.
  final IntsRefBuilder key = new IntsRefBuilder();
  key.append(0);
  final BytesRef template = new BytesRef(new byte[outputLength]);

  for (int label = 0; label < arcCount; label++) {
    key.setIntAt(0, label);
    template.bytes[0] = (byte) label;
    // The template is reused, so each entry gets its own copy.
    compiler.add(key.get(), BytesRef.deepCopyOf(template));
  }

  final FST<BytesRef> fst = compiler.compile();

  for (int label = 0; label < arcCount; label++) {
    key.setIntAt(0, label);
    final BytesRef found = Util.get(fst, key.get());
    assertNotNull(found);
    assertEquals(outputLength, found.length);
    assertEquals(found.bytes[found.offset], label);
    final int end = found.offset + found.length;
    for (int pos = found.offset + 1; pos < end; pos++) {
      assertEquals(0, found.bytes[pos]);
    }
  }
}
 
Example 15
Source Project: lucene-solr   Source File: TestFSTs.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Compiles three two-letter inputs with distinct long outputs and checks that each output is
 * returned by a lookup.
 */
public void testSimpleDepth() throws Exception {
  final FSTCompiler<Long> compiler =
      new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());

  final BytesRef ab = new BytesRef("ab");
  final BytesRef ac = new BytesRef("ac");
  final BytesRef bd = new BytesRef("bd");

  // Inputs are added in sorted order: ab, ac, bd.
  compiler.add(Util.toIntsRef(ab, new IntsRefBuilder()), 3L);
  compiler.add(Util.toIntsRef(ac, new IntsRefBuilder()), 5L);
  compiler.add(Util.toIntsRef(bd, new IntsRefBuilder()), 7L);

  final FST<Long> fst = compiler.compile();

  assertEquals(3, (long) Util.get(fst, ab));
  assertEquals(5, (long) Util.get(fst, ac));
  assertEquals(7, (long) Util.get(fst, bd));
}
 
Example 16
/**
 * Consume a maximal glue morpheme, if any, and consume the next word.
 *
 * <p>Walks the glue-morpheme FST over {@code utf32} starting at {@code offset}. Each time a
 * final arc is reached, a {@code GLUE_MORPHEME} chunk covering {@code [offset, i + 1)} is pushed,
 * {@code matchWord} is invoked on the remainder (if any input is left), and the chunk is popped
 * again.
 *
 * @param utf32 the input code points
 * @param offset index into {@code utf32.ints} at which matching starts; treated as an absolute
 *     array index, consistent with the remainder check below
 * @param builder passed through unchanged to {@code matchWord}
 * @param maxPathsBuilder passed through unchanged to {@code matchWord}
 * @param chunks stack of chunks matched so far; restored to its incoming state on return
 * @throws IOException if reading the FST fails
 */
private void matchGlueMorpheme(IntsRef utf32, final int offset, StringBuilder builder,
                               IntsRefBuilder maxPathsBuilder,
                               Deque<Chunk> chunks) throws IOException {
    FST.Arc<Object> arc = glueMorphemes.getFirstArc(new FST.Arc<>());
    BytesReader br = glueMorphemes.getBytesReader();
    // Exclusive end of the input as an absolute index into utf32.ints. The original loop
    // bound ignored utf32.offset and so stopped early for any IntsRef with a non-zero offset.
    final int end = utf32.offset + utf32.length;
    for (int i = offset; i < end; i++) {
        int chr = utf32.ints[i];
        arc = glueMorphemes.findTargetArc(chr, arc, arc, br);
        if (arc == null) {
            // No glue morpheme continues with this code point.
            break;
        }
        if (arc.isFinal()) {
            chunks.addLast(new Chunk(offset, i + 1, ChunkType.GLUE_MORPHEME));
            if (i + 1 < end) {
                matchWord(utf32, i + 1, builder, maxPathsBuilder, chunks);
            }
            // Backtrack so a longer glue morpheme can still be tried.
            chunks.removeLast();
        }
    }
}
 
Example 17
Source Project: datawave   Source File: DatawaveArithmetic.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Tests whether the string form of {@code object} is present in the given FST.
 *
 * <p>The lookup itself runs while holding the monitor of {@code fst}.
 *
 * @param object value whose {@code toString()} is looked up
 * @param fst the FST to query
 * @return {@code true} if the lookup returned a non-null result
 * @throws IOException if reading the FST fails
 */
public static boolean matchesFst(Object object, FST fst) throws IOException {
    final IntsRefBuilder keyBuilder = new IntsRefBuilder();
    Util.toUTF16(object.toString(), keyBuilder);
    final IntsRef key = keyBuilder.get();

    final boolean found;
    synchronized (fst) {
        found = Util.get(fst, key) != null;
    }
    return found;
}
 
Example 18
Source Project: lucene-solr   Source File: OrdsBlockTreeTermsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Copies every entry of {@code subIndex} into {@code fstCompiler}, rewriting each output's
 * ordinals relative to {@code termOrdOffset}.
 *
 * <p>NOTE(review): {@code startOrd} has the offset added while {@code endOrd} has it
 * subtracted; presumably {@code endOrd} is stored in an inverted form — confirm against
 * the {@code Output} definition.
 */
private void append(FSTCompiler<Output> fstCompiler, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<Output> entries = new BytesRefFSTEnum<>(subIndex);
  for (BytesRefFSTEnum.InputOutput<Output> entry = entries.next();
       entry != null;
       entry = entries.next()) {
    final Output source = entry.output;
    final long shiftedStart = termOrdOffset + source.startOrd;
    final long shiftedEnd = source.endOrd - termOrdOffset;
    final Output rebased = FST_OUTPUTS.newOutput(source.bytes, shiftedStart, shiftedEnd);
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), rebased);
  }
}
 
Example 19
Source Project: lucene-solr   Source File: FSTUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Sole constructor; stores the four arguments in the fields of the same name.
 *
 * @param state value stored in {@code this.state}
 * @param fstNode value stored in {@code this.fstNode}
 * @param output value stored in {@code this.output}
 * @param input value stored in {@code this.input}
 */
public Path(int state, FST.Arc<T> fstNode, T output, IntsRefBuilder input) {
  this.input = input;
  this.output = output;
  this.fstNode = fstNode;
  this.state = state;
}
 
Example 20
Source Project: lucene-solr   Source File: TestFSTsMisc.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Adds the input "a" three times and "b" once, then checks that the outputs of repeated
 * inputs come back as a list in insertion order.
 */
public void testListOfOutputs() throws Exception {
  final ListOfOutputs<Long> outputs = new ListOfOutputs<>(PositiveIntOutputs.getSingleton());
  final FSTCompiler<Object> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  final IntsRefBuilder scratch = new IntsRefBuilder();

  // Add the same input more than once and the outputs
  // are merged:
  final long[] valuesForA = {1L, 3L, 0L};
  for (long value : valuesForA) {
    compiler.add(Util.toIntsRef(new BytesRef("a"), scratch), value);
  }
  compiler.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L);
  final FST<Object> fst = compiler.compile();

  final Object mergedA = Util.get(fst, new BytesRef("a"));
  assertNotNull(mergedA);
  final List<Long> listForA = outputs.asList(mergedA);
  assertEquals(3, listForA.size());
  for (int pos = 0; pos < valuesForA.length; pos++) {
    assertEquals(valuesForA[pos], listForA.get(pos).longValue());
  }

  final Object singleB = Util.get(fst, new BytesRef("b"));
  assertNotNull(singleB);
  final List<Long> listForB = outputs.asList(singleB);
  assertEquals(1, listForB.size());
  assertEquals(17L, listForB.get(0).longValue());
}
 
Example 21
Source Project: lucene-solr   Source File: VersionBlockTreeTermsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Copies every input/output pair of {@code subIndex} into {@code fstCompiler}, in the order
 * the enumeration returns them. {@code scratchIntsRef} is reused as the conversion buffer
 * for each input.
 */
private void append(FSTCompiler<Pair<BytesRef,Long>> fstCompiler, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<Pair<BytesRef,Long>> entries = new BytesRefFSTEnum<>(subIndex);
  for (BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> entry = entries.next();
       entry != null;
       entry = entries.next()) {
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), entry.output);
  }
}
 
Example 22
Source Project: lucene-solr   Source File: Dictionary.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Compiles the affix map into an FST keyed by the UTF-32 form of each affix string, with the
 * associated integer list as the output.
 *
 * @param affixes affix strings mapped to their lists of integers; iterated in map order
 * @return the compiled FST
 * @throws IOException if compiling the FST fails
 */
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
  final FSTCompiler<IntsRef> compiler =
      new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
  final IntsRefBuilder keyScratch = new IntsRefBuilder();

  for (Map.Entry<String,List<Integer>> affix : affixes.entrySet()) {
    Util.toUTF32(affix.getKey(), keyScratch);

    // Copy the boxed list into a freshly sized IntsRef.
    final List<Integer> ids = affix.getValue();
    final IntsRef packed = new IntsRef(ids.size());
    for (Integer id : ids) {
      packed.ints[packed.length] = id;
      packed.length++;
    }

    compiler.add(keyScratch.get(), packed);
  }

  return compiler.compile();
}
 
Example 23
Source Project: lucene-solr   Source File: BaseSynonymParserTestCase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that the synonym map holds exactly the expected entry for a word.
 *
 * <p>The stored value starts with a vInt header: its lowest bit encodes whether the original
 * is kept (0 means keep) and the remaining bits hold the synonym count. The header is followed
 * by one vInt word id per synonym.
 *
 * @param synonynMap  the generated synonym map after parsing
 * @param word        word (phrase) whose synonyms are validated; spaces are replaced by the
 *                    word separator before the lookup
 * @param includeOrig whether the entry is expected to keep the original
 * @param synonyms    the expected synonyms, with word separators written as single spaces
 */
public static void assertEntryEquals(SynonymMap synonynMap, String word, boolean includeOrig, String[] synonyms)
    throws Exception {
  final String lookupKey = word.replace(' ', SynonymMap.WORD_SEPARATOR);
  final BytesRef encoded =
      Util.get(synonynMap.fst, Util.toUTF32(new CharsRef(lookupKey), new IntsRefBuilder()));
  assertNotNull("No synonyms found for: " + lookupKey, encoded);

  final ByteArrayDataInput in =
      new ByteArrayDataInput(encoded.bytes, encoded.offset, encoded.length);
  final int header = in.readVInt();

  final boolean keepOrig = (header & 0x1) == 0;
  assertEquals("Include original different than expected. Expected " + includeOrig + " was " + keepOrig,
      includeOrig, keepOrig);

  final int count = header >>> 1;
  assertEquals("Invalid synonym count. Expected " + synonyms.length + " was " + count,
      synonyms.length, count);

  final Set<String> expected = new HashSet<>(Arrays.asList(synonyms));
  final BytesRef wordScratch = new BytesRef();
  for (int remaining = count; remaining > 0; remaining--) {
    synonynMap.words.get(in.readVInt(), wordScratch);
    final String synonym = wordScratch.utf8ToString().replace(SynonymMap.WORD_SEPARATOR, ' ');
    assertTrue("Unexpected synonym found: " + synonym, expected.contains(synonym));
  }
}
 
Example 24
Source Project: lucene-solr   Source File: BooleanPerceptronClassifier.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Rebuilds the {@code fst} field from the given weights, storing each weight truncated to a
 * {@code long} under its key.
 *
 * @param weights the keys and weights to store, iterated in map order
 * @throws IOException if compiling the FST fails
 */
private void updateFST(SortedMap<String, Double> weights) throws IOException {
  final FSTCompiler<Long> compiler =
      new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
  final BytesRefBuilder keyBytes = new BytesRefBuilder();
  final IntsRefBuilder keyInts = new IntsRefBuilder();

  for (Map.Entry<String, Double> weight : weights.entrySet()) {
    keyBytes.copyChars(weight.getKey());
    final long stored = weight.getValue().longValue();
    compiler.add(Util.toIntsRef(keyBytes.get(), keyInts), stored);
  }

  fst = compiler.compile();
}
 
Example 25
Source Project: lucene-solr   Source File: AutomatonTestUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Recursively collects into <code>strings</code> every string that can be produced from
 * state <code>s</code>, using <code>path</code> as the prefix built so far.
 * <p>
 * Returns <code>false</code> if a transition leads back to a state already on the current
 * path, or if more than <code>limit</code> strings are found; returns <code>true</code> otherwise.
 * <code>limit</code>&lt;0 means "infinite".
 * <p>
 * On a <code>false</code> return neither <code>pathstates</code> nor <code>path</code> is
 * restored, so the caller should treat their contents as undefined in that case.
 *
 * @param a the automaton being walked
 * @param s the state to expand
 * @param pathstates the states on the current path; <code>s</code> is added on entry and
 *                   removed again only on a <code>true</code> return
 * @param strings receives a copy of the path for every accepted string found
 * @param path the labels of the current path; extended and truncated while recursing
 * @param limit maximum number of strings to collect, or a negative value for no limit
 */
private static boolean getFiniteStrings(Automaton a, int s, HashSet<Integer> pathstates, 
    HashSet<IntsRef> strings, IntsRefBuilder path, int limit) {
  pathstates.add(s);
  Transition t = new Transition();
  int count = a.initTransition(s, t);
  for (int i=0;i<count;i++) {
    a.getNextTransition(t);
    // The destination is already on the current path: give up.
    if (pathstates.contains(t.dest)) {
      return false;
    }
    // A transition covers the label range [t.min, t.max]; try each label in turn.
    for (int n = t.min; n <= t.max; n++) {
      path.append(n);
      if (a.isAccept(t.dest)) {
        strings.add(path.toIntsRef());
        if (limit >= 0 && strings.size() > limit) {
          return false;
        }
      }
      if (!getFiniteStrings(a, t.dest, pathstates, strings, path, limit)) {
        return false;
      }
      // Backtrack: drop the label appended for this iteration.
      path.setLength(path.length() - 1);
    }
  }
  pathstates.remove(s);
  return true;
}
 
Example 26
Source Project: lucene-solr   Source File: FSTTester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts a string to an {@link IntsRef} according to the input mode: mode 0 goes through
 * the {@code BytesRef} form of the string (utf8), any other mode uses one int per code
 * point (utf32).
 *
 * @param s the string to convert
 * @param inputMode 0 for utf8, anything else for utf32
 * @param ir scratch builder that receives the result
 * @return the ints held by {@code ir} after conversion
 */
static IntsRef toIntsRef(String s, int inputMode, IntsRefBuilder ir) {
  if (inputMode != 0) {
    // utf32
    return toIntsRefUTF32(s, ir);
  }
  // utf8
  return toIntsRef(new BytesRef(s), ir);
}
 
Example 27
Source Project: lucene-solr   Source File: FSTTester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Fills {@code ir} with the Unicode code points of {@code s}, one int per code point.
 *
 * @param s the string to convert
 * @param ir scratch builder; cleared before the code points are appended
 * @return the ints held by {@code ir} after conversion
 */
static IntsRef toIntsRefUTF32(String s, IntsRefBuilder ir) {
  final int end = s.length();
  ir.clear();

  int produced = 0;
  for (int pos = 0; pos < end; ) {
    produced++;
    ir.grow(produced);
    final int codePoint = s.codePointAt(pos);
    ir.append(codePoint);
    // Advance by one or two chars, depending on whether this was a surrogate pair.
    pos += Character.charCount(codePoint);
  }

  return ir.get();
}
 
Example 28
Source Project: lucene-solr   Source File: FSTTester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Fills {@code ir} with the bytes of {@code br}, each widened to an unsigned int (0..255).
 *
 * @param br the bytes to convert
 * @param ir scratch builder; grown to {@code br.length} and cleared before appending
 * @return the ints held by {@code ir} after conversion
 */
static IntsRef toIntsRef(BytesRef br, IntsRefBuilder ir) {
  ir.grow(br.length);
  ir.clear();

  final int end = br.offset + br.length;
  for (int pos = br.offset; pos < end; pos++) {
    // Mask so that negative bytes become 128..255 rather than negative ints.
    ir.append(br.bytes[pos] & 0xFF);
  }

  return ir.get();
}
 
Example 29
Source Project: lucene-solr   Source File: FSTTester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Walks the FST from its first arc, picking a random outgoing arc at every step until an
 * {@code FST.END_LABEL} arc is chosen.
 *
 * @param fst the FST to walk
 * @param in cleared on entry; receives the labels of the chosen arcs, excluding the end label
 * @return the outputs of all chosen arcs combined via {@code fst.outputs.add}
 * @throws IOException if reading the FST fails
 */
private T randomAcceptedWord(FST<T> fst, IntsRefBuilder in) throws IOException {
  FST.Arc<T> current = fst.getFirstArc(new FST.Arc<T>());
  final List<FST.Arc<T>> candidates = new ArrayList<>();
  in.clear();

  T accumulated = fst.outputs.getNoOutput();
  final FST.BytesReader reader = fst.getBytesReader();

  for (;;) {
    // Gather a copy of every arc leaving the current node.
    fst.readFirstTargetArc(current, current, reader);
    candidates.add(new FST.Arc<T>().copyFrom(current));
    while (!current.isLast()) {
      fst.readNextArc(current, reader);
      candidates.add(new FST.Arc<T>().copyFrom(current));
    }

    // Choose one of them at random and forget the rest.
    current = candidates.get(random.nextInt(candidates.size()));
    candidates.clear();

    accumulated = fst.outputs.add(accumulated, current.output());

    // The end label terminates the word and is not part of the input.
    if (current.label() == FST.END_LABEL) {
      return accumulated;
    }
    in.append(current.label());
  }
}
 
Example 30
Source Project: lucene-solr   Source File: BlockTreeTermsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Copies every input/output pair of {@code subIndex} into {@code fstCompiler}, in the order
 * the enumeration returns them. {@code scratchIntsRef} is reused as the conversion buffer
 * for each input.
 */
private void append(FSTCompiler<BytesRef> fstCompiler, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
  final BytesRefFSTEnum<BytesRef> entries = new BytesRefFSTEnum<>(subIndex);
  for (BytesRefFSTEnum.InputOutput<BytesRef> entry = entries.next();
       entry != null;
       entry = entries.next()) {
    fstCompiler.add(Util.toIntsRef(entry.input, scratchIntsRef), entry.output);
  }
}