Java Code Examples for org.apache.lucene.document.FieldType#setStoreTermVectors()

The following examples show how to use org.apache.lucene.document.FieldType#setStoreTermVectors(). Each example is taken from an open-source project; the source file and its license are noted above each snippet.
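
For orientation, here is a minimal, self-contained sketch of the pattern the examples below share: build a FieldType, opt in to term vectors (plus positions and/or offsets), index a field with it, and read the vectors back. This is a sketch against the Lucene 8.x-style API used by the lucene-solr examples; the class name and the "body" field are illustrative, not taken from any project below.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class StoreTermVectorsSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new ByteBuffersDirectory();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

    // Start from the TextField defaults, then opt in to term vectors.
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);          // store a per-document term vector
    ft.setStoreTermVectorPositions(true);  // also record token positions
    ft.setStoreTermVectorOffsets(true);    // also record character offsets
    ft.freeze();                           // make the type immutable before use

    Document doc = new Document();
    doc.add(new Field("body", "the quick brown fox", ft));
    writer.addDocument(doc);
    writer.close();

    // Read the stored term vector of document 0 back.
    try (IndexReader reader = DirectoryReader.open(dir)) {
      Terms vector = reader.getTermVectors(0).terms("body");
      TermsEnum terms = vector.iterator();
      while (terms.next() != null) {
        System.out.println(terms.term().utf8ToString() + " freq=" + terms.totalTermFreq());
      }
    }
    dir.close();
  }
}
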
Example 1
Source File: TestFixBrokenOffsetsFilter.java    From lucene-solr with Apache License 2.0
public void testBogusTermVectors() throws IOException {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  Field field = new Field("foo", "", ft);
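  // The canned offsets go backwards (5..10, then 1..4); FixBrokenOffsetsFilter repairs them so the document can be indexed.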
  field.setTokenStream(new FixBrokenOffsetsFilter(new CannedTokenStream(
      new Token("bar", 5, 10), new Token("bar", 1, 4)
      )));
  doc.add(field);
  iw.addDocument(doc);
  iw.close();
  dir.close();
}
 
Example 2
Source File: TestCustomTermFreq.java    From lucene-solr with Apache License 2.0
public void testInvalidTermVectorPositions() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

  Document doc = new Document();
  FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
  fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  fieldType.setStoreTermVectors(true);
  fieldType.setStoreTermVectorPositions(true);
  Field field = new Field("field",
                          new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                              new int[] {42, 128, 17, 100}),
                          fieldType);
  doc.add(field);
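  // Term vector positions cannot be combined with a custom TermFrequencyAttribute, so indexing must fail: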
  Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);});
  assertEquals("field \"field\": cannot index term vector positions while using custom TermFrequencyAttribute", e.getMessage());
  IOUtils.close(w, dir);
}
 
Example 3
Source File: TestIndexWriterMerging.java    From lucene-solr with Apache License 2.0
public void testSetMaxMergeDocs() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()))
    .setMergeScheduler(new MyMergeScheduler())
    .setMaxBufferedDocs(2)
    .setMergePolicy(newLogMergePolicy());
  LogMergePolicy lmp = (LogMergePolicy) conf.getMergePolicy();
  lmp.setMaxMergeDocs(20);
  lmp.setMergeFactor(2);
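  // mergeFactor=2 and maxMergeDocs=20 keep segments small, forcing many merges across the 177 docs added below.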
  IndexWriter iw = new IndexWriter(dir, conf);
  Document document = new Document();

  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  
  document.add(newField("tvtest", "a b c", customType));
  for (int i = 0; i < 177; i++) {
    iw.addDocument(document);
  }
  iw.close();
  dir.close();
}
 
Example 4
Source File: TestDirectoryReader.java    From lucene-solr with Apache License 2.0
static void addDocumentWithTermVectorFields(IndexWriter writer) throws IOException
{
    Document doc = new Document();
    FieldType customType5 = new FieldType(TextField.TYPE_STORED);
    customType5.setStoreTermVectors(true);                // vectors only
    FieldType customType6 = new FieldType(TextField.TYPE_STORED);
    customType6.setStoreTermVectors(true);
    customType6.setStoreTermVectorOffsets(true);          // vectors + offsets
    FieldType customType7 = new FieldType(TextField.TYPE_STORED);
    customType7.setStoreTermVectors(true);
    customType7.setStoreTermVectorPositions(true);        // vectors + positions
    FieldType customType8 = new FieldType(TextField.TYPE_STORED);
    customType8.setStoreTermVectors(true);
    customType8.setStoreTermVectorOffsets(true);
    customType8.setStoreTermVectorPositions(true);        // vectors + offsets + positions
    doc.add(newTextField("tvnot", "tvnot", Field.Store.YES));
    doc.add(newField("termvector","termvector",customType5));
    doc.add(newField("tvoffset","tvoffset", customType6));
    doc.add(newField("tvposition","tvposition", customType7));
    doc.add(newField("tvpositionoffset","tvpositionoffset", customType8));
    
    writer.addDocument(doc);
}
 
Example 5
Source File: TestPostingsOffsets.java    From lucene-solr with Apache License 2.0
public void testLegalbutVeryLargeOffsets() throws Exception {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  Token t1 = new Token("foo", 0, Integer.MAX_VALUE-500);
  if (random().nextBoolean()) {
    t1.setPayload(new BytesRef("test"));
  }
  Token t2 = new Token("foo", Integer.MAX_VALUE-500, Integer.MAX_VALUE);
  TokenStream tokenStream = new CannedTokenStream(
      new Token[] { t1, t2 }
  );
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  // store some term vectors for the checkindex cross-check
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  ft.setStoreTermVectorOffsets(true);
  Field field = new Field("foo", tokenStream, ft);
  doc.add(field);
  iw.addDocument(doc);
  iw.close();
  dir.close();
}
 
Example 6
Source File: TestTermVectorsWriter.java    From lucene-solr with Apache License 2.0
public void testNoAbortOnBadTVSettings() throws Exception {
  Directory dir = newDirectory();
  // Don't use RandomIndexWriter because we want to be sure both docs go to 1 seg:
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  IndexWriter iw = new IndexWriter(dir, iwc);

  Document doc = new Document();
  iw.addDocument(doc);
  FieldType ft = new FieldType(StoredField.TYPE);
  ft.setStoreTermVectors(true);
  ft.freeze();
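  // StoredField.TYPE is not indexed, so requesting term vectors on it is an illegal combination.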
  doc.add(new Field("field", "value", ft));

  expectThrows(IllegalArgumentException.class, () -> {
    iw.addDocument(doc);
  });

  IndexReader r = DirectoryReader.open(iw);

  // Make sure the exc didn't lose our first document:
  assertEquals(1, r.numDocs());
  iw.close();
  r.close();
  dir.close();
}
 
Example 7
Source File: TestPostingsOffsets.java    From lucene-solr with Apache License 2.0
public void testAddFieldTwice() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  FieldType customType3 = new FieldType(TextField.TYPE_STORED);
  customType3.setStoreTermVectors(true);
  customType3.setStoreTermVectorPositions(true);
  customType3.setStoreTermVectorOffsets(true);    
  customType3.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  doc.add(new Field("content3", "here is more content with aaa aaa aaa", customType3));
  doc.add(new Field("content3", "here is more content with aaa aaa aaa", customType3));
  iw.addDocument(doc);
  iw.close();
  dir.close(); // dirs from newDirectory() are verified with CheckIndex when closed
}
 
Example 8
Source File: TestTermVectorsWriter.java    From lucene-solr with Apache License 2.0
public void testEndOffsetPositionStandardEmptyField() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);
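  // The empty first value still consumes the analyzer's offset gap, so the second value's offsets start at 1.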
  Field f = newField("field", "", customType);
  Field f2 = newField("field", "crunch man", customType);
  doc.add(f);
  doc.add(f2);
  w.addDocument(doc);
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator();
  assertNotNull(termsEnum.next());
  PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);

  assertEquals(1, (int) termsEnum.totalTermFreq());
  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(1, dpEnum.startOffset());
  assertEquals(7, dpEnum.endOffset());

  assertNotNull(termsEnum.next());
  dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(8, dpEnum.startOffset());
  assertEquals(11, dpEnum.endOffset());

  r.close();
  dir.close();
}
 
Example 9
Source File: TermVectorsAdapterTest.java    From lucene-solr with Apache License 2.0
@Override
protected void createIndex() throws IOException {
  indexDir = createTempDir("testIndex");

  Directory dir = newFSDirectory(indexDir);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new StandardAnalyzer());

  // Three flavors of term vectors below: plain, with positions, and with positions and offsets.
  FieldType textType = new FieldType();
  textType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  textType.setTokenized(true);
  textType.setStoreTermVectors(true);

  FieldType textType_pos = new FieldType();
  textType_pos.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  textType_pos.setTokenized(true);
  textType_pos.setStoreTermVectors(true);
  textType_pos.setStoreTermVectorPositions(true);

  FieldType textType_pos_offset = new FieldType();
  textType_pos_offset.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  textType_pos_offset.setTokenized(true);
  textType_pos_offset.setStoreTermVectors(true);
  textType_pos_offset.setStoreTermVectorPositions(true);
  textType_pos_offset.setStoreTermVectorOffsets(true);

  String text = "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.";
  Document doc = new Document();
  doc.add(newField("text1", text, textType));
  doc.add(newField("text2", text, textType_pos));
  doc.add(newField("text3", text, textType_pos_offset));
  writer.addDocument(doc);

  writer.commit();
  writer.close();
  dir.close();
}
 
Example 10
Source File: LuceneIndexer.java    From MtgDesktopCompanion with GNU General Public License v3.0
private Document toDocuments(MagicCard mc) {
  Document doc = new Document();

  // MagicCard, MTGColor, and the serializer field come from the enclosing project.
  FieldType fieldType = new FieldType();
  fieldType.setStored(true);
  fieldType.setStoreTermVectors(true);
  fieldType.setTokenized(true);
  fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

  doc.add(new Field("name", mc.getName(), fieldType));

  if (mc.getCost() != null)
    doc.add(new Field("cost", mc.getCost(), fieldType));
  else
    doc.add(new Field("cost", "", fieldType));

  if (mc.getText() != null)
    doc.add(new Field("text", mc.getText(), fieldType));
  else
    doc.add(new Field("text", "", fieldType));

  doc.add(new Field("type", mc.getFullType(), fieldType));
  doc.add(new Field("set", mc.getCurrentSet().getId(), fieldType));
  doc.add(new StoredField("cmc", mc.getCmc()));
  doc.add(new StringField("data", serializer.toJson(mc), Field.Store.YES));

  for (MTGColor color : mc.getColors()) {
    doc.add(new Field("color", color.getCode(), fieldType));
  }

  return doc;
}
 
Example 11
Source File: FastVectorHighlighterTest.java    From lucene-solr with Apache License 2.0
public void testWithSynonym() throws IOException {
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  FieldType type = new FieldType(TextField.TYPE_STORED);
  type.setStoreTermVectorOffsets(true);
  type.setStoreTermVectorPositions(true);
  type.setStoreTermVectors(true);
  type.freeze();
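  // FastVectorHighlighter requires term vectors with both positions and offsets.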

  Document doc = new Document();
  doc.add( new Field("field", "the quick brown fox", type ));
  writer.addDocument(doc);
  FastVectorHighlighter highlighter = new FastVectorHighlighter();

  IndexReader reader = DirectoryReader.open(writer);
  int docId = 0;

  // query1: simple synonym query
  SynonymQuery synQuery = new SynonymQuery.Builder("field")
      .addTerm(new Term("field", "quick"))
      .addTerm(new Term("field", "fast"))
      .build();
  FieldQuery fieldQuery  = highlighter.getFieldQuery(synQuery, reader);
  String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
  assertEquals("the <b>quick</b> brown fox", bestFragments[0]);

  // query2: boolean query with synonym query
  BooleanQuery.Builder bq =
      new BooleanQuery.Builder()
          .add(new BooleanClause(synQuery, Occur.MUST))
          .add(new BooleanClause(new TermQuery(new Term("field", "fox")), Occur.MUST));
  fieldQuery  = highlighter.getFieldQuery(bq.build(), reader);
  bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
  assertEquals("the <b>quick</b> brown <b>fox</b>", bestFragments[0]);

  reader.close();
  writer.close();
  dir.close();
}
 
Example 12
Source File: TestTermVectors.java    From lucene-solr with Apache License 2.0
@BeforeClass
public static void beforeClass() throws Exception {                  
  directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)).setMergePolicy(newLogMergePolicy()));
  //writer.setNoCFSRatio(1.0);
  //writer.infoStream = System.out;
  for (int i = 0; i < 1000; i++) {
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    int mod3 = i % 3;
    int mod2 = i % 2;
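    // Cycle through all four term-vector flavors: positions + offsets, positions only, offsets only, and plain vectors.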
    if (mod2 == 0 && mod3 == 0) {
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorOffsets(true);
      ft.setStoreTermVectorPositions(true);
    } else if (mod2 == 0) {
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorPositions(true);
    } else if (mod3 == 0) {
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorOffsets(true);
    } else {
      ft.setStoreTermVectors(true);
    }
    doc.add(new Field("field", English.intToEnglish(i), ft));
    //test no term vectors too
    doc.add(new TextField("noTV", English.intToEnglish(i), Field.Store.YES));
    writer.addDocument(doc);
  }
  reader = writer.getReader();
  writer.close();
}
 
Example 13
Source File: TestBackwardsCompatibility.java    From lucene-solr with Apache License 2.0
private void addDoc(IndexWriter writer, int id) throws IOException
{
  Document doc = new Document();
  doc.add(new TextField("content", "aaa", Field.Store.NO));
  doc.add(new StringField("id", Integer.toString(id), Field.Store.YES));
  FieldType customType2 = new FieldType(TextField.TYPE_STORED);
  customType2.setStoreTermVectors(true);
  customType2.setStoreTermVectorPositions(true);
  customType2.setStoreTermVectorOffsets(true);
  doc.add(new Field("autf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", customType2));
  doc.add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", customType2));
  doc.add(new Field("content2", "here is more content with aaa aaa aaa", customType2));
  doc.add(new Field("fie\u2C77ld", "field with non-ascii name", customType2));

  // add docvalues fields
  doc.add(new NumericDocValuesField("dvByte", (byte) id));
  byte[] bytes = new byte[] {
    (byte) (id >>> 24), (byte) (id >>> 16), (byte) (id >>> 8), (byte) id
  };
  BytesRef ref = new BytesRef(bytes);
  doc.add(new BinaryDocValuesField("dvBytesDerefFixed", ref));
  doc.add(new BinaryDocValuesField("dvBytesDerefVar", ref));
  doc.add(new SortedDocValuesField("dvBytesSortedFixed", ref));
  doc.add(new SortedDocValuesField("dvBytesSortedVar", ref));
  doc.add(new BinaryDocValuesField("dvBytesStraightFixed", ref));
  doc.add(new BinaryDocValuesField("dvBytesStraightVar", ref));
  doc.add(new DoubleDocValuesField("dvDouble", (double)id));
  doc.add(new FloatDocValuesField("dvFloat", (float)id));
  doc.add(new NumericDocValuesField("dvInt", id));
  doc.add(new NumericDocValuesField("dvLong", id));
  doc.add(new NumericDocValuesField("dvPacked", id));
  doc.add(new NumericDocValuesField("dvShort", (short)id));
  doc.add(new SortedSetDocValuesField("dvSortedSet", ref));
  doc.add(new SortedNumericDocValuesField("dvSortedNumeric", id));

  doc.add(new IntPoint("intPoint1d", id));
  doc.add(new IntPoint("intPoint2d", id, 2*id));
  doc.add(new FloatPoint("floatPoint1d", (float) id));
  doc.add(new FloatPoint("floatPoint2d", (float) id, (float) 2*id));
  doc.add(new LongPoint("longPoint1d", id));
  doc.add(new LongPoint("longPoint2d", id, 2*id));
  doc.add(new DoublePoint("doublePoint1d", (double) id));
  doc.add(new DoublePoint("doublePoint2d", (double) id, (double) 2*id));
  doc.add(new BinaryPoint("binaryPoint1d", bytes));
  doc.add(new BinaryPoint("binaryPoint2d", bytes, bytes));
  
  // a field with both offsets and term vectors for a cross-check
  FieldType customType3 = new FieldType(TextField.TYPE_STORED);
  customType3.setStoreTermVectors(true);
  customType3.setStoreTermVectorPositions(true);
  customType3.setStoreTermVectorOffsets(true);
  customType3.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  doc.add(new Field("content5", "here is more content with aaa aaa aaa", customType3));
  // a field that omits only positions
  FieldType customType4 = new FieldType(TextField.TYPE_STORED);
  customType4.setStoreTermVectors(true);
  customType4.setStoreTermVectorPositions(false);
  customType4.setStoreTermVectorOffsets(true);
  customType4.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  doc.add(new Field("content6", "here is more content with aaa aaa aaa", customType4));
  // TODO: 
  //   index different norms types via similarity (we use a random one currently?!)
  //   remove any analyzer randomness, explicitly add payloads for certain fields.
  writer.addDocument(doc);
}
 
Example 14
Source File: TestMemoryIndexAgainstDirectory.java    From lucene-solr with Apache License 2.0
public void testDuelMemoryIndexCoreDirectoryWithArrayField() throws Exception {

    final String field_name = "text";
    MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
    if (random().nextBoolean()) {
      mockAnalyzer.setOffsetGap(random().nextInt(100));
    }
    //index into a random directory
    FieldType type = new FieldType(TextField.TYPE_STORED);
    type.setStoreTermVectorOffsets(true);
    type.setStoreTermVectorPayloads(false);
    type.setStoreTermVectorPositions(true);
    type.setStoreTermVectors(true);
    type.freeze();

    Document doc = new Document();
    doc.add(new Field(field_name, "la la", type));
    doc.add(new Field(field_name, "foo bar foo bar foo", type));

    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(random(), mockAnalyzer));
    writer.updateDocument(new Term("id", "1"), doc);
    writer.commit();
    writer.close();
    DirectoryReader reader = DirectoryReader.open(dir);

    //Index document in Memory index
    MemoryIndex memIndex = new MemoryIndex(true);
    memIndex.addField(field_name, "la la", mockAnalyzer);
    memIndex.addField(field_name, "foo bar foo bar foo", mockAnalyzer);

    //compare term vectors
    Terms ramTv = reader.getTermVector(0, field_name);
    IndexReader memIndexReader = memIndex.createSearcher().getIndexReader();
    TestUtil.checkReader(memIndexReader);
    Terms memTv = memIndexReader.getTermVector(0, field_name);

    compareTermVectors(ramTv, memTv, field_name);
    memIndexReader.close();
    reader.close();
    dir.close();

}
 
Example 15
Source File: TestTermVectorsWriter.java    From lucene-solr with Apache License 2.0
public void testDoubleOffsetCounting() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  Document doc = new Document();
  FieldType customType = new FieldType(StringField.TYPE_NOT_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);
  Field f = newField("field", "abcd", customType);
  doc.add(f);
  doc.add(f);
  Field f2 = newField("field", "", customType);
  doc.add(f2);
  doc.add(f);
  w.addDocument(doc);
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  Terms vector = r.getTermVectors(0).terms("field");
  assertNotNull(vector);
  TermsEnum termsEnum = vector.iterator();
  assertNotNull(termsEnum.next());
  assertEquals("", termsEnum.term().utf8ToString());

  // Token "" occurred once
  assertEquals(1, termsEnum.totalTermFreq());

  PostingsEnum dpEnum = termsEnum.postings(null, PostingsEnum.ALL);
  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(8, dpEnum.startOffset());
  assertEquals(8, dpEnum.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

  // Token "abcd" occurred three times
  assertEquals(new BytesRef("abcd"), termsEnum.next());
  dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
  assertEquals(3, termsEnum.totalTermFreq());

  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(0, dpEnum.startOffset());
  assertEquals(4, dpEnum.endOffset());

  dpEnum.nextPosition();
  assertEquals(4, dpEnum.startOffset());
  assertEquals(8, dpEnum.endOffset());

  dpEnum.nextPosition();
  assertEquals(8, dpEnum.startOffset());
  assertEquals(12, dpEnum.endOffset());

  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
  assertNull(termsEnum.next());
  r.close();
  dir.close();
}
 
Example 16
Source File: TestIndexWriterWithThreads.java    From lucene-solr with Apache License 2.0
@Override
public void run() {
  try {
    syncStart.await();
  } catch (BrokenBarrierException | InterruptedException e) {
    error = e;
    throw new RuntimeException(e);
  }

  final Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);
  
  doc.add(newField("field", "aaa bbb ccc ddd eee fff ggg hhh iii jjj", customType));
  doc.add(new NumericDocValuesField("dv", 5));

  int idUpto = 0;
  int fullCount = 0;

  do {
    try {
      writer.updateDocument(new Term("id", ""+(idUpto++)), doc);
      addCount++;
    } catch (IOException ioe) {
      if (VERBOSE) {
        System.out.println("TEST: expected exc:");
        ioe.printStackTrace(System.out);
      }
      //System.out.println(Thread.currentThread().getName() + ": hit exc");
      //ioe.printStackTrace(System.out);
      if (ioe.getMessage().startsWith("fake disk full at") ||
          ioe.getMessage().equals("now failing on purpose")) {
        diskFull = true;
        try {
          Thread.sleep(1);
        } catch (InterruptedException ie) {
          throw new ThreadInterruptedException(ie);
        }
        if (fullCount++ >= 5)
          break;
      } else {
        if (noErrors) {
          System.out.println(Thread.currentThread().getName() + ": ERROR: unexpected IOException:");
          ioe.printStackTrace(System.out);
          error = ioe;
        }
        break;
      }
    } catch (AlreadyClosedException ace) {
      // OK: abort closes the writer
      break;
    } catch (Throwable t) {
      if (noErrors) {
        System.out.println(Thread.currentThread().getName() + ": ERROR: unexpected Throwable:");
        t.printStackTrace(System.out);
        error = t;
      }
      break;
    }
  } while (true);
}
 
Example 17
Source File: TestIndexWriterWithThreads.java    From lucene-solr with Apache License 2.0
public void _testSingleThreadFailure(MockDirectoryWrapper.Failure failure) throws IOException {
  MockDirectoryWrapper dir = newMockDirectory();

  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()))
    .setMaxBufferedDocs(2)
    .setMergeScheduler(new ConcurrentMergeScheduler())
    .setCommitOnClose(false);

  if (iwc.getMergeScheduler() instanceof ConcurrentMergeScheduler) {
    iwc.setMergeScheduler(new SuppressingConcurrentMergeScheduler() {
        @Override
        protected boolean isOK(Throwable th) {
          return th instanceof AlreadyClosedException ||
            (th instanceof IllegalStateException && th.getMessage().contains("this writer hit an unrecoverable error"));
        }
      });
  }

  IndexWriter writer = new IndexWriter(dir, iwc);
  final Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setStoreTermVectors(true);
  customType.setStoreTermVectorPositions(true);
  customType.setStoreTermVectorOffsets(true);
  doc.add(newField("field", "aaa bbb ccc ddd eee fff ggg hhh iii jjj", customType));

  for (int i = 0; i < 6; i++) {
    writer.addDocument(doc);
  }

  // Arm the injected failure: the following adds and commits are expected to throw.
  dir.failOn(failure);
  failure.setDoFail();
  expectThrows(IOException.class, () -> {
    writer.addDocument(doc);
    writer.addDocument(doc);
    writer.commit();
  });

  failure.clearDoFail();
  expectThrows(AlreadyClosedException.class, () -> {
    writer.addDocument(doc);
    writer.commit();
    writer.close();
  });

  assertTrue(writer.isDeleterClosed());
  dir.close();
}
 
Example 18
Source File: FastVectorHighlighterTest.java    From lucene-solr with Apache License 2.0
public void testBooleanPhraseWithSynonym() throws IOException {
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  Document doc = new Document();
  FieldType type = new FieldType(TextField.TYPE_NOT_STORED);
  type.setStoreTermVectorOffsets(true);
  type.setStoreTermVectorPositions(true);
  type.setStoreTermVectors(true);
  type.freeze();
  // The synonym token covers the whole URL (offsets 6..29) and, via positionIncrement 0, stacks at the same position as the "http" token that precedes it in the stream.
  Token syn = new Token("httpwwwfacebookcom", 6, 29);
  syn.setPositionIncrement(0);
  CannedTokenStream ts = new CannedTokenStream(
      new Token("test", 0, 4),
      new Token("http", 6, 10),
      syn,
      new Token("www", 13, 16),
      new Token("facebook", 17, 25),
      new Token("com", 26, 29)
  );
  Field field = new Field("field", ts, type);
  doc.add(field);
  doc.add(new StoredField("field", "Test: http://www.facebook.com"));
  writer.addDocument(doc);
  FastVectorHighlighter highlighter = new FastVectorHighlighter();
  
  IndexReader reader = DirectoryReader.open(writer);
  int docId = 0;
  
  // query1: match
  PhraseQuery pq = new PhraseQuery("field", "test", "http", "www", "facebook", "com");
  FieldQuery fieldQuery  = highlighter.getFieldQuery(pq, reader);
  String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
  assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
  
  // query2: match
  PhraseQuery pq2 = new PhraseQuery("field", "test", "httpwwwfacebookcom", "www", "facebook", "com");
  fieldQuery  = highlighter.getFieldQuery(pq2, reader);
  bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
  assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
  
  // query3: OR query1 and query2 together
  BooleanQuery.Builder bq = new BooleanQuery.Builder();
  bq.add(pq, BooleanClause.Occur.SHOULD);
  bq.add(pq2, BooleanClause.Occur.SHOULD);
  fieldQuery  = highlighter.getFieldQuery(bq.build(), reader);
  bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
  assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
  
  reader.close();
  writer.close();
  dir.close();
}
 
Example 19
Source File: TokenSourcesTest.java    From lucene-solr with Apache License 2.0
@Repeat(iterations = 10)
//@Seed("947083AB20AB2D4F")
public void testRandomizedRoundTrip() throws Exception {
  final int distinct = TestUtil.nextInt(random(), 1, 10);

  String[] terms = new String[distinct];
  BytesRef[] termBytes = new BytesRef[distinct];
  for (int i = 0; i < distinct; ++i) {
    terms[i] = TestUtil.randomRealisticUnicodeString(random());
    termBytes[i] = new BytesRef(terms[i]);
  }

  final BaseTermVectorsFormatTestCase.RandomTokenStream rTokenStream =
      new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes);
  //check to see if the token streams might have non-deterministic testable result
  final boolean storeTermVectorPositions = random().nextBoolean();
  final int[] startOffsets = rTokenStream.getStartOffsets();
  final int[] positionsIncrements = rTokenStream.getPositionsIncrements();
  for (int i = 1; i < positionsIncrements.length; i++) {
    if (storeTermVectorPositions && positionsIncrements[i] != 0) {
      continue;
    }
    //TODO should RandomTokenStream ensure endOffsets for tokens at same position and same startOffset are greater
    // than previous token's endOffset?  That would increase the testable possibilities.
    if (startOffsets[i] == startOffsets[i-1]) {
      if (VERBOSE)
        System.out.println("Skipping test because can't easily validate random token-stream is correct.");
      rTokenStream.close();
      return;
    }
  }

  //sanity check itself
  assertTokenStreamContents(rTokenStream,
      rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
      rTokenStream.getPositionsIncrements());

  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  myFieldType.setStoreTermVectors(true);
  myFieldType.setStoreTermVectorOffsets(true);
  myFieldType.setStoreTermVectorPositions(storeTermVectorPositions);
  //payloads require positions; it will throw an error otherwise
  myFieldType.setStoreTermVectorPayloads(storeTermVectorPositions && random().nextBoolean());

  Document doc = new Document();
  doc.add(new Field("field", rTokenStream, myFieldType));
  writer.addDocument(doc);

  IndexReader reader = writer.getReader();
  writer.close();
  assertEquals(1, reader.numDocs());

  TokenStream vectorTokenStream =
      TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);

  //sometimes check payloads
  PayloadAttribute payloadAttribute = null;
  if (myFieldType.storeTermVectorPayloads() && usually()) {
    payloadAttribute = vectorTokenStream.addAttribute(PayloadAttribute.class);
  }
  assertTokenStreamContents(vectorTokenStream,
      rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
      myFieldType.storeTermVectorPositions() ? rTokenStream.getPositionsIncrements() : null);
  //test payloads
  if (payloadAttribute != null) {
    vectorTokenStream.reset();
    for (int i = 0; vectorTokenStream.incrementToken(); i++) {
      assertEquals(rTokenStream.getPayloads()[i], payloadAttribute.getPayload());
    }
  }

  reader.close();
  dir.close();
  rTokenStream.close();
}
 
Example 20
Source File: TestPostingsOffsets.java    From lucene-solr with Apache License 2.0
public void testBasic() throws Exception {
  Directory dir = newDirectory();
  
  // iwc is an IndexWriterConfig initialized elsewhere in the test class (not shown in this snippet).
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();

  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
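  // Sometimes also store term vectors so CheckIndex can cross-check the postings offsets against the vectors.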
  if (random().nextBoolean()) {
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(random().nextBoolean());
    ft.setStoreTermVectorOffsets(random().nextBoolean());
  }
  Token[] tokens = new Token[] {
    makeToken("a", 1, 0, 6),
    makeToken("b", 1, 8, 9),
    makeToken("a", 1, 9, 17),
    makeToken("c", 1, 19, 50),
  };
  doc.add(new Field("content", new CannedTokenStream(tokens), ft));

  w.addDocument(doc);
  IndexReader r = w.getReader();
  w.close();

  PostingsEnum dp = MultiTerms.getTermPostingsEnum(r, "content", new BytesRef("a"));
  assertNotNull(dp);
  assertEquals(0, dp.nextDoc());
  assertEquals(2, dp.freq());
  assertEquals(0, dp.nextPosition());
  assertEquals(0, dp.startOffset());
  assertEquals(6, dp.endOffset());
  assertEquals(2, dp.nextPosition());
  assertEquals(9, dp.startOffset());
  assertEquals(17, dp.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());

  dp = MultiTerms.getTermPostingsEnum(r, "content", new BytesRef("b"));
  assertNotNull(dp);
  assertEquals(0, dp.nextDoc());
  assertEquals(1, dp.freq());
  assertEquals(1, dp.nextPosition());
  assertEquals(8, dp.startOffset());
  assertEquals(9, dp.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());

  dp = MultiTerms.getTermPostingsEnum(r, "content", new BytesRef("c"));
  assertNotNull(dp);
  assertEquals(0, dp.nextDoc());
  assertEquals(1, dp.freq());
  assertEquals(3, dp.nextPosition());
  assertEquals(19, dp.startOffset());
  assertEquals(50, dp.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());

  r.close();
  dir.close();
}