Java Code Examples for org.apache.lucene.document.FieldType#setStoreTermVectorOffsets()

The following examples show how to use org.apache.lucene.document.FieldType#setStoreTermVectorOffsets(). They are drawn from open source projects; the source file and license are listed above each example.
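
All of the examples share the same core pattern: term vector offsets (and positions) only take effect when term vectors themselves are stored, and Lucene rejects the field at indexing time otherwise. A minimal sketch of that pattern (the field name and sample text are illustrative):

FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);          // prerequisite: offsets/positions require term vectors
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
ft.freeze();                           // optional: prevents further changes to the FieldType
Document doc = new Document();
doc.add(new Field("body", "some text to index", ft));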
Example 1
Source File: TestPostingsOffsets.java    From lucene-solr with Apache License 2.0
private void checkTokens(Token[] tokens) throws IOException {
  Directory dir = newDirectory();
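  // iwc is an IndexWriterConfig instance field, configured in this test's setUp()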
  RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
  boolean success = false;
  try {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // store some term vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
   
    Document doc = new Document();
    doc.add(new Field("body", new CannedTokenStream(tokens), ft));
    riw.addDocument(doc);
    riw.close();
    success = true;
  } finally {
    if (success) {
      IOUtils.close(dir);
    } else {
      IOUtils.closeWhileHandlingException(riw, dir);
    }
  }
}
 
Example 2
Source File: TestCustomTermFreq.java    From lucene-solr with Apache License 2.0
public void testInvalidTermVectorOffsets() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

  Document doc = new Document();
  FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
  fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  fieldType.setStoreTermVectors(true);
  fieldType.setStoreTermVectorOffsets(true);
  Field field = new Field("field",
                          new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                              new int[] {42, 128, 17, 100}),
                          fieldType);
  doc.add(field);
  Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);});
  assertEquals("field \"field\": cannot index term vector offsets while using custom TermFrequencyAttribute", e.getMessage());
  IOUtils.close(w, dir);
}
 
Example 3
Source File: TestPerFieldPostingsFormat2.java    From lucene-solr with Apache License 2.0
private void doTestMixedPostings(Codec codec) throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setCodec(codec);
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  // turn on vectors for the checkindex cross-check
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  ft.setStoreTermVectorPositions(true);
  Field idField = new Field("id", "", ft);
  Field dateField = new Field("date", "", ft);
  doc.add(idField);
  doc.add(dateField);
  for (int i = 0; i < 100; i++) {
    idField.setStringValue(Integer.toString(random().nextInt(50)));
    dateField.setStringValue(Integer.toString(random().nextInt(100)));
    iw.addDocument(doc);
  }
  iw.close();
  dir.close(); // checkindex
}
 
Example 4
Source File: TestDirectoryReader.java    From lucene-solr with Apache License 2.0
static void addDocumentWithTermVectorFields(IndexWriter writer) throws IOException
{
    Document doc = new Document();
    FieldType customType5 = new FieldType(TextField.TYPE_STORED);
    customType5.setStoreTermVectors(true);
    FieldType customType6 = new FieldType(TextField.TYPE_STORED);
    customType6.setStoreTermVectors(true);
    customType6.setStoreTermVectorOffsets(true);
    FieldType customType7 = new FieldType(TextField.TYPE_STORED);
    customType7.setStoreTermVectors(true);
    customType7.setStoreTermVectorPositions(true);
    FieldType customType8 = new FieldType(TextField.TYPE_STORED);
    customType8.setStoreTermVectors(true);
    customType8.setStoreTermVectorOffsets(true);
    customType8.setStoreTermVectorPositions(true);
    doc.add(newTextField("tvnot", "tvnot", Field.Store.YES));
    doc.add(newField("termvector","termvector",customType5));
    doc.add(newField("tvoffset","tvoffset", customType6));
    doc.add(newField("tvposition","tvposition", customType7));
    doc.add(newField("tvpositionoffset","tvpositionoffset", customType8));
    
    writer.addDocument(doc);
}
 
Example 5
Source File: ClassificationTestBase.java    From lucene-solr with Apache License 2.0
@Override
@Before
public void setUp() throws Exception {
  super.setUp();
  dir = newDirectory();
  indexWriter = new RandomIndexWriter(random(), dir);
  textFieldName = "text";
  categoryFieldName = "cat";
  booleanFieldName = "bool";
  ft = new FieldType(TextField.TYPE_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  ft.setStoreTermVectorPositions(true);
}
 
Example 6
Source File: TestTermVectorsWriter.java    From lucene-solr with Apache License 2.0
public void testEndOffsetPositionStandardEmptyField() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);
  Field f = newField("field", "", customType);
  Field f2 = newField("field", "crunch man", customType);
  doc.add(f);
  doc.add(f2);
  w.addDocument(doc);
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
  assertNotNull(termsEnum.next());
  PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);

  assertEquals(1, (int) termsEnum.totalTermFreq());
  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(1, dpEnum.startOffset());
  assertEquals(7, dpEnum.endOffset());

  assertNotNull(termsEnum.next());
  dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(8, dpEnum.startOffset());
  assertEquals(11, dpEnum.endOffset());

  r.close();
  dir.close();
}
 
Example 7
Source File: TestTermVectorsWriter.java    From lucene-solr with Apache License 2.0
public void testEndOffsetPositionCharAnalyzer() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);
  Field f = newField("field", "abcd   ", customType);
  doc.add(f);
  doc.add(f);
  w.addDocument(doc);
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
  assertNotNull(termsEnum.next());
  PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);
  assertEquals(2, termsEnum.totalTermFreq());

  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(0, dpEnum.startOffset());
  assertEquals(4, dpEnum.endOffset());

  dpEnum.nextPosition();
  assertEquals(8, dpEnum.startOffset());
  assertEquals(12, dpEnum.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

  r.close();
  dir.close();
}
 
Example 8
Source File: FastVectorHighlighterTest.java    From lucene-solr with Apache License 2.0
public void testSimpleHighlightTest() throws IOException {
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  Document doc = new Document();
  FieldType type = new FieldType(TextField.TYPE_STORED);
  type.setStoreTermVectorOffsets(true);
  type.setStoreTermVectorPositions(true);
  type.setStoreTermVectors(true);
  type.freeze();
  Field field = new Field("field", "This is a test where foo is highlighed and should be highlighted", type);
  
  doc.add(field);
  writer.addDocument(doc);
  FastVectorHighlighter highlighter = new FastVectorHighlighter();
  
  IndexReader reader = DirectoryReader.open(writer);
  int docId = 0;
  FieldQuery fieldQuery  = highlighter.getFieldQuery( new TermQuery(new Term("field", "foo")), reader );
  String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
  // highlighted results are centered 
  assertEquals("This is a test where <b>foo</b> is highlighed and should be highlighted", bestFragments[0]);
  bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 52, 1);
  assertEquals("This is a test where <b>foo</b> is highlighed and should be", bestFragments[0]);
  bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 30, 1);
  assertEquals("a test where <b>foo</b> is highlighed", bestFragments[0]);
  reader.close();
  writer.close();
  dir.close();
}
 
Example 9
Source File: StringIndexConverter.java    From jstarcraft-core with Apache License 2.0
@Override
public Iterable<IndexableField> convert(LuceneContext context, String path, Field field, LuceneIndex annotation, Type type, Object data) {
    Collection<IndexableField> indexables = new LinkedList<>();
    FieldType configuration = new FieldType();
    configuration.setIndexOptions(IndexOptions.DOCS);
    if (annotation.analyze()) {
        configuration.setTokenized(true);

        LuceneTerm negative = annotation.negative();
        if (negative.offset()) {
            configuration.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        } else if (negative.position()) {
            configuration.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        } else if (negative.frequency()) {
            configuration.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        }

        LuceneTerm positive = annotation.positive();
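        // Note: Lucene rejects a field that stores term vector offsets or positions
        // without setStoreTermVectors(true), so offset()/position() below only take
        // effect when frequency() is also enabled.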
        if (positive.offset()) {
            configuration.setStoreTermVectorOffsets(true);
        }
        if (positive.position()) {
            configuration.setStoreTermVectorPositions(true);
        }
        if (positive.frequency()) {
            configuration.setStoreTermVectors(true);
        }
    }
    indexables.add(new org.apache.lucene.document.Field(path, (String) data, configuration));
    return indexables;
}
 
Example 10
Source File: HighlighterPhraseTest.java    From lucene-solr with Apache License 2.0
public void testSparsePhrase() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  try {
    final Document document = new Document();

    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectorOffsets(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectors(true);
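    // FIELD and TokenStreamSparse are helpers defined in the enclosing test class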
    document.add(new Field(FIELD, new TokenStreamSparse(), customType));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = DirectoryReader.open(directory);
  try {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    final PhraseQuery phraseQuery = new PhraseQuery(FIELD, "did", "jump");
    TopDocs hits = indexSearcher.search(phraseQuery, 1);
    assertEquals(0, hits.totalHits.value);
    final Highlighter highlighter = new Highlighter(
        new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
        new QueryScorer(phraseQuery));
    final TokenStream tokenStream =
        TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
    assertEquals(
        highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
        highlighter.getBestFragment(tokenStream, TEXT));
  } finally {
    indexReader.close();
    directory.close();
  }
}
 
Example 11
Source File: TestDirectoryReader.java    From lucene-solr with Apache License 2.0
public void testTermVectors() throws Exception {
  Directory d = newDirectory();
  // set up writer
  IndexWriter writer = new IndexWriter(
                                       d,
                                       newIndexWriterConfig(new MockAnalyzer(random()))
                                       .setMergePolicy(newLogMergePolicy())
                                       );
  // want to get some more segments here
  // new termvector fields
  int mergeFactor = ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor();
  FieldType customType5 = new FieldType(TextField.TYPE_STORED);
  customType5.setStoreTermVectors(true);
  FieldType customType6 = new FieldType(TextField.TYPE_STORED);
  customType6.setStoreTermVectors(true);
  customType6.setStoreTermVectorOffsets(true);
  FieldType customType7 = new FieldType(TextField.TYPE_STORED);
  customType7.setStoreTermVectors(true);
  customType7.setStoreTermVectorPositions(true);
  FieldType customType8 = new FieldType(TextField.TYPE_STORED);
  customType8.setStoreTermVectors(true);
  customType8.setStoreTermVectorOffsets(true);
  customType8.setStoreTermVectorPositions(true);
  for (int i = 0; i < 5 * mergeFactor; i++) {
    Document doc = new Document();
    doc.add(new TextField("tvnot", "one two two three three three", Field.Store.YES));
    doc.add(new Field("termvector", "one two two three three three", customType5));
    doc.add(new Field("tvoffset", "one two two three three three", customType6));
    doc.add(new Field("tvposition", "one two two three three three", customType7));
    doc.add(new Field("tvpositionoffset", "one two two three three three", customType8));
    
    writer.addDocument(doc);
  }
  writer.close();
  d.close();
}
 
Example 12
Source File: TestTermVectorsWriter.java    From lucene-solr with Apache License 2.0
public void testDoubleOffsetCounting2() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);
  Field f = newField("field", "abcd", customType);
  doc.add(f);
  doc.add(f);
  w.addDocument(doc);
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
  assertNotNull(termsEnum.next());
  PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);
  assertEquals(2, termsEnum.totalTermFreq());

  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(0, dpEnum.startOffset());
  assertEquals(4, dpEnum.endOffset());

  dpEnum.nextPosition();
  assertEquals(5, dpEnum.startOffset());
  assertEquals(9, dpEnum.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

  r.close();
  dir.close();
}
 
Example 13
Source File: TestIndexWriterMerging.java    From lucene-solr with Apache License 2.0
public void testForceMergeDeletes2() throws IOException {
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(
      dir,
      newIndexWriterConfig(new MockAnalyzer(random()))
        .setMaxBufferedDocs(2)
        .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .setMergePolicy(newLogMergePolicy(50))
  );

  Document document = new Document();

  FieldType customType = new FieldType();
  customType.setStored(true);

  FieldType customType1 = new FieldType(TextField.TYPE_NOT_STORED);
  customType1.setTokenized(false);
  customType1.setStoreTermVectors(true);
  customType1.setStoreTermVectorPositions(true);
  customType1.setStoreTermVectorOffsets(true);
  
  Field storedField = newField("stored", "stored", customType);
  document.add(storedField);
  Field termVectorField = newField("termVector", "termVector", customType1);
  document.add(termVectorField);
  Field idField = newStringField("id", "", Field.Store.NO);
  document.add(idField);
  for(int i=0;i<98;i++) {
    idField.setStringValue("" + i);
    writer.addDocument(document);
  }
  writer.close();

  IndexReader ir = DirectoryReader.open(dir);
  assertEquals(98, ir.maxDoc());
  assertEquals(98, ir.numDocs());
  ir.close();
  
  IndexWriterConfig dontMergeConfig = new IndexWriterConfig(new MockAnalyzer(random()))
    .setMergePolicy(NoMergePolicy.INSTANCE);
  writer = new IndexWriter(dir, dontMergeConfig);
  for(int i=0;i<98;i+=2) {
    writer.deleteDocuments(new Term("id", "" + i));
  }
  writer.close();
  
  ir = DirectoryReader.open(dir);
  assertEquals(49, ir.numDocs());
  ir.close();

  writer = new IndexWriter(
      dir,
      newIndexWriterConfig(new MockAnalyzer(random()))
        .setMergePolicy(newLogMergePolicy(3))
  );
  assertEquals(49, writer.getDocStats().numDocs);
  writer.forceMergeDeletes();
  writer.close();
  ir = DirectoryReader.open(dir);
  assertEquals(49, ir.maxDoc());
  assertEquals(49, ir.numDocs());
  ir.close();
  dir.close();
}
 
Example 14
Source File: TestAddIndexes.java    From lucene-solr with Apache License 2.0
public void testHangOnClose() throws IOException {

    Directory dir = newDirectory();
    LogByteSizeMergePolicy lmp = new LogByteSizeMergePolicy();
    lmp.setNoCFSRatio(0.0);
    lmp.setMergeFactor(100);
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
        new MockAnalyzer(random()))
        .setMaxBufferedDocs(5).setMergePolicy(lmp));

    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    doc.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType));
    for(int i=0;i<60;i++)
      writer.addDocument(doc);

    Document doc2 = new Document();
    FieldType customType2 = new FieldType();
    customType2.setStored(true);
    doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
    doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
    doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
    doc2.add(newField("content", "aaa bbb ccc ddd eee fff ggg hhh iii", customType2));
    for(int i=0;i<10;i++)
      writer.addDocument(doc2);
    writer.close();

    Directory dir2 = newDirectory();
    lmp = new LogByteSizeMergePolicy();
    lmp.setMinMergeMB(0.0001);
    lmp.setNoCFSRatio(0.0);
    lmp.setMergeFactor(4);
    writer = new IndexWriter(dir2, newIndexWriterConfig(new MockAnalyzer(random()))
        .setMergeScheduler(new SerialMergeScheduler()).setMergePolicy(lmp));
    writer.addIndexes(dir);
    writer.close();
    dir.close();
    dir2.close();
  }
 
Example 15
Source File: TokenSourcesTest.java    From lucene-solr with Apache License 2.0
public void testOverlapWithPositionsAndOffsetExactPhrase()
    throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory,
      newIndexWriterConfig(null));
  try {
    final Document document = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
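    // FIELD and OverlappingTokenStream are helpers defined in the enclosing test class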
    document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = DirectoryReader.open(directory);
  try {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
    // query.add(new SpanTermQuery(new Term(FIELD, "the")));
    // query.add(new SpanTermQuery(new Term(FIELD, "fox")));
    final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
        new SpanTermQuery(new Term(FIELD, "the")),
        new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);

    TopDocs hits = indexSearcher.search(phraseQuery, 1);
    assertEquals(1, hits.totalHits.value);
    final Highlighter highlighter = new Highlighter(
        new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
        new QueryScorer(phraseQuery));
    final TokenStream tokenStream =
        TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
    assertEquals("<B>the fox</B> did not jump",
        highlighter.getBestFragment(tokenStream, TEXT));
  } finally {
    indexReader.close();
    directory.close();
  }
}
 
Example 16
Source File: TestIndexWriterWithThreads.java    From lucene-solr with Apache License 2.0
public void _testSingleThreadFailure(MockDirectoryWrapper.Failure failure) throws IOException {
  MockDirectoryWrapper dir = newMockDirectory();

  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()))
    .setMaxBufferedDocs(2)
    .setMergeScheduler(new ConcurrentMergeScheduler())
    .setCommitOnClose(false);

  if (iwc.getMergeScheduler() instanceof ConcurrentMergeScheduler) {
    iwc.setMergeScheduler(new SuppressingConcurrentMergeScheduler() {
        @Override
        protected boolean isOK(Throwable th) {
          return th instanceof AlreadyClosedException ||
            (th instanceof IllegalStateException && th.getMessage().contains("this writer hit an unrecoverable error"));
        }
      });
  }

  IndexWriter writer = new IndexWriter(dir, iwc);
  final Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);
  doc.add(newField("field", "aaa bbb ccc ddd eee fff ggg hhh iii jjj", customType));

  for(int i=0;i<6;i++)
    writer.addDocument(doc);

  dir.failOn(failure);
  failure.setDoFail();
  expectThrows(IOException.class, () -> {
    writer.addDocument(doc);
    writer.addDocument(doc);
    writer.commit();
  });

  failure.clearDoFail();
  expectThrows(AlreadyClosedException.class, () -> {
    writer.addDocument(doc);
    writer.commit();
    writer.close();
  });

  assertTrue(writer.isDeleterClosed());
  dir.close();
}
 
Example 17
Source File: TestIndexWriterMerging.java    From lucene-solr with Apache License 2.0
public void testForceMergeDeletes3() throws IOException {
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(
      dir,
      newIndexWriterConfig(new MockAnalyzer(random()))
          .setMaxBufferedDocs(2)
          .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setMergePolicy(newLogMergePolicy(50))
  );

  FieldType customType = new FieldType();
  customType.setStored(true);

  FieldType customType1 = new FieldType(TextField.TYPE_NOT_STORED);
  customType1.setTokenized(false);
  customType1.setStoreTermVectors(true);
  customType1.setStoreTermVectorPositions(true);
  customType1.setStoreTermVectorOffsets(true);
  
  Document document = new Document();
  Field storedField = newField("stored", "stored", customType);
  document.add(storedField);
  Field termVectorField = newField("termVector", "termVector", customType1);
  document.add(termVectorField);
  Field idField = newStringField("id", "", Field.Store.NO);
  document.add(idField);
  for(int i=0;i<98;i++) {
    idField.setStringValue("" + i);
    writer.addDocument(document);
  }
  writer.close();

  IndexReader ir = DirectoryReader.open(dir);
  assertEquals(98, ir.maxDoc());
  assertEquals(98, ir.numDocs());
  ir.close();
  
  IndexWriterConfig dontMergeConfig = new IndexWriterConfig(new MockAnalyzer(random()))
    .setMergePolicy(NoMergePolicy.INSTANCE);
  writer = new IndexWriter(dir, dontMergeConfig);
  for(int i=0;i<98;i+=2) {
    writer.deleteDocuments(new Term("id", "" + i));
  }
  writer.close();
  ir = DirectoryReader.open(dir);
  assertEquals(49, ir.numDocs());
  ir.close();

  writer = new IndexWriter(
      dir,
      newIndexWriterConfig(new MockAnalyzer(random()))
         .setMergePolicy(newLogMergePolicy(3))
  );
  writer.forceMergeDeletes(false);
  writer.close();
  ir = DirectoryReader.open(dir);
  assertEquals(49, ir.maxDoc());
  assertEquals(49, ir.numDocs());
  ir.close();
  dir.close();
}
 
Example 18
Source File: TokenSourcesTest.java    From lucene-solr with Apache License 2.0
public void testPayloads() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  myFieldType.setStoreTermVectors(true);
  myFieldType.setStoreTermVectorOffsets(true);
  myFieldType.setStoreTermVectorPositions(true);
  myFieldType.setStoreTermVectorPayloads(true);

  // curOffset and getToken(...) are helpers defined in the enclosing test class
  curOffset = 0;

  Token[] tokens = new Token[] {
    getToken("foxes"),
    getToken("can"),
    getToken("jump"),
    getToken("high")
  };

  Document doc = new Document();
  doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
  writer.addDocument(doc);

  IndexReader reader = writer.getReader();
  writer.close();
  assertEquals(1, reader.numDocs());

  TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);

  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
  PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);

  ts.reset();
  for(Token token : tokens) {
    assertTrue(ts.incrementToken());
    assertEquals(token.toString(), termAtt.toString());
    assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
    assertEquals(token.getPayload(), payloadAtt.getPayload());
    assertEquals(token.startOffset(), offsetAtt.startOffset());
    assertEquals(token.endOffset(), offsetAtt.endOffset());
  }

  assertFalse(ts.incrementToken());

  reader.close();
  dir.close();
}
 
Example 19
Source File: FastVectorHighlighterTest.java    From lucene-solr with Apache License 2.0
public void testMultiValuedSortByScore() throws IOException {
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter( dir, newIndexWriterConfig(new MockAnalyzer( random() ) ) );
  Document doc = new Document();
  FieldType type = new FieldType( TextField.TYPE_STORED );
  type.setStoreTermVectorOffsets( true );
  type.setStoreTermVectorPositions( true );
  type.setStoreTermVectors( true );
  type.freeze();
  doc.add( new Field( "field", "zero if naught", type ) ); // The first two fields contain the best match
  doc.add( new Field( "field", "hero of legend", type ) ); // but total a lower score (3) than the bottom
  doc.add( new Field( "field", "naught of hero", type ) ); // two fields (4)
  doc.add( new Field( "field", "naught of hero", type ) );
  writer.addDocument(doc);

  FastVectorHighlighter highlighter = new FastVectorHighlighter();
  
  ScoreOrderFragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder();    
  fragmentsBuilder.setDiscreteMultiValueHighlighting( true );
  IndexReader reader = DirectoryReader.open(writer);
  String[] preTags = new String[] { "<b>" };
  String[] postTags = new String[] { "</b>" };
  Encoder encoder = new DefaultEncoder();
  int docId = 0;
  BooleanQuery.Builder query = new BooleanQuery.Builder();
  query.add( clause( "field", "hero" ), Occur.SHOULD);
  query.add( clause( "field", "of" ), Occur.SHOULD);
  query.add( clause( "field", "legend" ), Occur.SHOULD);
  FieldQuery fieldQuery = highlighter.getFieldQuery( query.build(), reader );

  for ( FragListBuilder fragListBuilder : new FragListBuilder[] {
    new SimpleFragListBuilder(), new WeightedFragListBuilder() } ) {
    String[] bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", 20, 1,
        fragListBuilder, fragmentsBuilder, preTags, postTags, encoder );
    assertEquals("<b>hero</b> <b>of</b> <b>legend</b>", bestFragments[0]);
    bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", 28, 1,
        fragListBuilder, fragmentsBuilder, preTags, postTags, encoder );
    assertEquals("<b>hero</b> <b>of</b> <b>legend</b>", bestFragments[0]);
    bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", 30000, 1,
        fragListBuilder, fragmentsBuilder, preTags, postTags, encoder );
    assertEquals("<b>hero</b> <b>of</b> <b>legend</b>", bestFragments[0]);
  }

  reader.close();
  writer.close();
  dir.close();
}
 
Example 20
Source File: FastVectorHighlighterTest.java    From lucene-solr with Apache License 2.0
public void testCommonTermsQueryHighlight() throws IOException {
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
      .setMergePolicy(newLogMergePolicy())); // don't reorder doc ids
  FieldType type = new FieldType(TextField.TYPE_STORED);
  type.setStoreTermVectorOffsets(true);
  type.setStoreTermVectorPositions(true);
  type.setStoreTermVectors(true);
  type.freeze();
  String[] texts = {
      "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
      "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
      "JFK has been shot", "John Kennedy has been shot",
      "This text has a typo in referring to Keneddy",
      "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" };
  for (int i = 0; i < texts.length; i++) {
    Document doc = new Document();
    Field field = new Field("field", texts[i], type);
    doc.add(field);
    writer.addDocument(doc);
  }
  CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 2);
  query.add(new Term("field", "text"));
  query.add(new Term("field", "long"));
  query.add(new Term("field", "very"));
 
  FastVectorHighlighter highlighter = new FastVectorHighlighter();
  IndexReader reader = DirectoryReader.open(writer);
  IndexSearcher searcher = newSearcher(reader);
  TopDocs hits = searcher.search(query, 10);
  assertEquals(2, hits.totalHits.value);
  FieldQuery fieldQuery  = highlighter.getFieldQuery(query, reader);
  String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, 1, "field", 1000, 1);
  assertEquals("This piece of <b>text</b> refers to Kennedy at the beginning then has a longer piece of <b>text</b> that is <b>very</b> <b>long</b> in the middle and finally ends with another reference to Kennedy", bestFragments[0]);

  fieldQuery  = highlighter.getFieldQuery(query, reader);
  bestFragments = highlighter.getBestFragments(fieldQuery, reader, 0, "field", 1000, 1);
  assertEquals("Hello this is a piece of <b>text</b> that is <b>very</b> <b>long</b> and contains too much preamble and the meat is really here which says kennedy has been shot", bestFragments[0]);

  reader.close();
  writer.close();
  dir.close();
}