org.apache.lucene.search.suggest.InputIterator Java Examples

The following examples show how to use org.apache.lucene.search.suggest.InputIterator. The source file and originating project for each example are noted above it.
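
Before the individual examples, here is a minimal, self-contained sketch of an InputIterator backed by an in-memory map. The class name InMemoryInputIterator and the term-to-weight layout are illustrative assumptions, but the interface methods mirror the anonymous implementation in Example #9 below; an iterator like this can be handed to any of the build(InputIterator) calls shown in the examples.

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.util.BytesRef;

// Illustrative sketch (not from lucene-solr): an InputIterator over an in-memory term -> weight map.
public class InMemoryInputIterator implements InputIterator {

  private final Iterator<Map.Entry<String, Long>> entries;
  private long currentWeight;

  public InMemoryInputIterator(Map<String, Long> termsAndWeights) {
    // A TreeMap keeps the terms in sorted order; some suggesters additionally re-sort the input themselves.
    this.entries = new TreeMap<>(termsAndWeights).entrySet().iterator();
  }

  @Override
  public BytesRef next() throws IOException {
    if (!entries.hasNext()) {
      return null;              // end of input
    }
    Map.Entry<String, Long> entry = entries.next();
    currentWeight = entry.getValue();
    return new BytesRef(entry.getKey());
  }

  @Override
  public long weight() {
    return currentWeight;       // weight of the term returned by the most recent next()
  }

  @Override
  public BytesRef payload() {
    return null;                // this iterator carries no payloads
  }

  @Override
  public boolean hasPayloads() {
    return false;
  }

  @Override
  public Set<BytesRef> contexts() {
    return null;                // this iterator carries no contexts
  }

  @Override
  public boolean hasContexts() {
    return false;
  }
}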
Example #1
Source File: JaspellLookup.java    From lucene-solr with Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  count = 0;
  trie = new JaspellTernarySearchTrie();
  trie.setMatchAlmostDiff(editDistance);
  BytesRef spare;
  final CharsRefBuilder charsSpare = new CharsRefBuilder();

  while ((spare = iterator.next()) != null) {
    final long weight = iterator.weight();
    if (spare.length == 0) {
      continue;
    }
    charsSpare.copyUTF8Bytes(spare);
    trie.put(charsSpare.toString(), weight);
    count++;
  }
}
 
Example #2
Source File: TSTLookup.java    From lucene-solr with Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  root = new TernaryTreeNode();

  // make sure it's sorted and the comparator uses UTF16 sort order
  iterator = new SortedInputIterator(tempDir, tempFileNamePrefix, iterator, utf8SortedAsUTF16SortOrder);
  count = 0;
  ArrayList<String> tokens = new ArrayList<>();
  ArrayList<Number> vals = new ArrayList<>();
  BytesRef spare;
  CharsRefBuilder charsSpare = new CharsRefBuilder();
  while ((spare = iterator.next()) != null) {
    charsSpare.copyUTF8Bytes(spare);
    tokens.add(charsSpare.toString());
    vals.add(Long.valueOf(iterator.weight()));
    count++;
  }
  autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
}
 
Example #3
Source File: LuceneDictionary.java    From lucene-solr with Apache License 2.0
@Override
public final InputIterator getEntryIterator() throws IOException {
  final Terms terms = MultiTerms.getTerms(reader, field);
  if (terms != null) {
    return new InputIterator.InputIteratorWrapper(terms.iterator());
  } else {
    return InputIterator.EMPTY;
  }
}
 
Example #4
Source File: FreeTextSuggester.java    From lucene-solr with Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  build(iterator, IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
}
 
Example #5
Source File: FSTCompletionLookup.java    From lucene-solr with Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }

  OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix);
  ExternalRefSorter externalSorter = new ExternalRefSorter(sorter);
  IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
  String tempSortedFileName = null;

  OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
  OfflineSorter.ByteSequencesReader reader = null;

  // Push floats up front before sequences to sort them. For now, assume they are non-negative.
  // If negative floats are allowed some trickery needs to be done to find their byte order.
  count = 0;
  try {
    byte [] buffer = new byte [0];
    ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
    BytesRef spare;
    int inputLineCount = 0;
    while ((spare = iterator.next()) != null) {
      if (spare.length + 4 >= buffer.length) {
        buffer = ArrayUtil.grow(buffer, spare.length + 4);
      }

      output.reset(buffer);
      output.writeInt(encodeWeight(iterator.weight()));
      output.writeBytes(spare.bytes, spare.offset, spare.length);
      writer.write(buffer, 0, output.getPosition());
      inputLineCount++;
    }
    CodecUtil.writeFooter(tempInput);
    writer.close();

    // We don't know the distribution of scores and we need to bucket them, so we'll sort
    // and divide into equal buckets.
    tempSortedFileName = sorter.sort(tempInput.getName());
    tempDir.deleteFile(tempInput.getName());

    FSTCompletionBuilder builder = new FSTCompletionBuilder(
        buckets, externalSorter, sharedTailLength);

    reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
    long line = 0;
    int previousBucket = 0;
    int previousScore = 0;
    ByteArrayDataInput input = new ByteArrayDataInput();
    BytesRef tmp2 = new BytesRef();
    while (true) {
      BytesRef scratch = reader.next();
      if (scratch == null) {
        break;
      }
      input.reset(scratch.bytes, scratch.offset, scratch.length);
      int currentScore = input.readInt();

      int bucket;
      if (line > 0 && currentScore == previousScore) {
        bucket = previousBucket;
      } else {
        bucket = (int) (line * buckets / inputLineCount);
      }
      previousScore = currentScore;
      previousBucket = bucket;

      // Only append the input, discard the weight.
      tmp2.bytes = scratch.bytes;
      tmp2.offset = scratch.offset + input.getPosition();
      tmp2.length = scratch.length - input.getPosition();
      builder.add(tmp2, bucket);

      line++;
      count++;
    }

    // The two FSTCompletions share the same automaton.
    this.higherWeightsCompletion = builder.build();
    this.normalCompletion = new FSTCompletion(
        higherWeightsCompletion.getFST(), false, exactMatchFirst);
    
  } finally {
    IOUtils.closeWhileHandlingException(reader, writer, externalSorter);
    IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
  }
}
 
Example #6
Source File: WFSTCompletionLookup.java    From lucene-solr with Apache License 2.0
WFSTInputIterator(Directory tempDir, String tempFileNamePrefix, InputIterator source) throws IOException {
  super(tempDir, tempFileNamePrefix, source);
  assert source.hasPayloads() == false;
}
 
Example #7
Source File: PlainTextDictionary.java    From lucene-solr with Apache License 2.0
@Override
public InputIterator getEntryIterator() throws IOException {
  return new InputIterator.InputIteratorWrapper(new FileIterator());
}
 
Example #8
Source File: HighFrequencyDictionary.java    From lucene-solr with Apache License 2.0
@Override
public final InputIterator getEntryIterator() throws IOException {
  return new HighFrequencyIterator();
}
 
Example #9
Source File: TestFreeTextSuggester.java    From lucene-solr with Apache License 2.0
@Ignore
public void testWiki() throws Exception {
  final LineFileDocs lfd = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt");
  // Skip header:
  lfd.nextDoc();
  Analyzer analyzer = new MockAnalyzer(random());
  FreeTextSuggester sug = new FreeTextSuggester(analyzer);
  sug.build(new InputIterator() {

      private int count;

      @Override
      public long weight() {
        return 1;
      }

      @Override
      public BytesRef next() {
        Document doc;
        try {
          doc = lfd.nextDoc();
        } catch (IOException ioe) {
          throw new RuntimeException(ioe);
        }
        if (doc == null) {
          return null;
        }
        if (count++ == 10000) {
          return null;
        }
        return new BytesRef(doc.get("body"));
      }

      @Override
      public BytesRef payload() {
        return null;
      }

      @Override
      public boolean hasPayloads() {
        return false;
      }

      @Override
      public Set<BytesRef> contexts() {
        return null;
      }

      @Override
      public boolean hasContexts() {
        return false;
      }
    });
  if (VERBOSE) {
    System.out.println(sug.ramBytesUsed() + " bytes");

    List<LookupResult> results = sug.lookup("general r", 10);
    System.out.println("results:");
    for(LookupResult result : results) {
      System.out.println("  " + result);
    }
  }
  analyzer.close();
  lfd.close();
}
 
Example #10
Source File: RandomTestDictionaryFactory.java    From lucene-solr with Apache License 2.0
@Override
public InputIterator getEntryIterator() throws IOException {
  return new InputIterator.InputIteratorWrapper(new RandomByteRefIterator());
}
 
Example #11
Source File: Dictionary.java    From lucene-solr with Apache License 2.0
/**
 * Returns an iterator over all the entries
 * @return Iterator
 */
InputIterator getEntryIterator() throws IOException;
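
As a hedged usage sketch (not taken from the examples above), the snippet below wires a Dictionary's entry iterator into a Lookup. The index path, the field name "title", and the no-argument TSTLookup constructor are assumptions for illustration; Lookup constructor signatures vary between Lucene versions.

import java.nio.file.Paths;
import java.util.List;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DictionarySuggestSketch {
  public static void main(String[] args) throws Exception {
    // The index location and field name are illustrative assumptions.
    Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
    try (IndexReader reader = DirectoryReader.open(dir)) {
      // Dictionary#getEntryIterator() supplies the InputIterator (see Examples #3 and #11).
      Dictionary dictionary = new LuceneDictionary(reader, "title");

      // Any Lookup implementation consumes the iterator via build(InputIterator);
      // the no-argument TSTLookup constructor is assumed here.
      Lookup lookup = new TSTLookup();
      lookup.build(dictionary.getEntryIterator());

      // Top 5 suggestions for the prefix "luc".
      List<Lookup.LookupResult> results = lookup.lookup("luc", false, 5);
      for (Lookup.LookupResult result : results) {
        System.out.println(result.key + " => " + result.value);
      }
    }
  }
}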