Java Code Examples for org.apache.lucene.document.FieldType#setIndexOptions()

The following examples show how to use org.apache.lucene.document.FieldType#setIndexOptions(). Each example names the open-source project, source file, and license it was taken from.
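setIndexOptions() accepts an org.apache.lucene.index.IndexOptions value (NONE, DOCS, DOCS_AND_FREQS, DOCS_AND_FREQS_AND_POSITIONS, or DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) that controls how much postings data Lucene records for a field. As orientation, here is a minimal sketch of the configure-then-index flow most of the examples follow; the class name, field name, sample text, and the use of ByteBuffersDirectory (available since Lucene 8) are illustrative choices, not taken from any example below.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class SetIndexOptionsDemo {
  public static void main(String[] args) throws Exception {
    // Start from a standard tokenized, unstored text type and customize it.
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    // Record doc IDs and term frequencies; omit positions and offsets.
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    ft.freeze(); // make the type immutable before sharing it across fields

    try (Directory dir = new ByteBuffersDirectory();
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      Document doc = new Document();
      doc.add(new Field("body", "some example text", ft));
      writer.addDocument(doc);
    }
  }
}

Several examples below skip freeze() and instead copy a preset such as TextField.TYPE_NOT_STORED, overriding only the options they care about; an unfrozen FieldType stays mutable, which Example 14's "this is evil" comment warns about.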
Example 1
Source File: TestCustomTermFreq.java    From lucene-solr with Apache License 2.0
public void testInvalidDocsOnly() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

  Document doc = new Document();
  FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
  fieldType.setIndexOptions(IndexOptions.DOCS);
  Field field = new Field("field",
                          new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                              new int[] {42, 128, 17, 100}),
                          fieldType);
  doc.add(field);
  Exception e = expectThrows(IllegalStateException.class, () -> {w.addDocument(doc);});
  assertEquals("field \"field\": must index term freq while using custom TermFrequencyAttribute", e.getMessage());
  IOUtils.close(w, dir);
}
 
Example 2
Source File: TestPostingsOffsets.java    From lucene-solr with Apache License 2.0
public void testLegalbutVeryLargeOffsets() throws Exception {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  Token t1 = new Token("foo", 0, Integer.MAX_VALUE-500);
  if (random().nextBoolean()) {
    t1.setPayload(new BytesRef("test"));
  }
  Token t2 = new Token("foo", Integer.MAX_VALUE-500, Integer.MAX_VALUE);
  TokenStream tokenStream = new CannedTokenStream(
      new Token[] { t1, t2 }
  );
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  // store some term vectors for the checkindex cross-check
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  ft.setStoreTermVectorOffsets(true);
  Field field = new Field("foo", tokenStream, ft);
  doc.add(field);
  iw.addDocument(doc);
  iw.close();
  dir.close();
}
 
Example 3
Source File: TestPostingsOffsets.java    From lucene-solr with Apache License 2.0
private void checkTokens(Token[] field1, Token[] field2) throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
  boolean success = false;
  try {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // store some term vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
   
    Document doc = new Document();
    doc.add(new Field("body", new CannedTokenStream(field1), ft));
    doc.add(new Field("body", new CannedTokenStream(field2), ft));
    riw.addDocument(doc);
    riw.close();
    success = true;
  } finally {
    if (success) {
      IOUtils.close(dir);
    } else {
      IOUtils.closeWhileHandlingException(riw, dir);
    }
  }
}
 
Example 4
Source File: FilePositionDoc.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
public static Document Document(File f)
     throws java.io.FileNotFoundException {
  Document doc = new Document();
  doc.add(new StoredField("path", f.getPath()));
  doc.add(new StoredField("modified",
                    DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE)));
  
  //create new FieldType to store term positions (TextField is not sufficiently configurable)
  FieldType ft = new FieldType();
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  ft.setTokenized(true);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  Field contentsField = new Field("contents", new FileReader(f), ft);

  doc.add(contentsField);
  return doc;
}
 
Example 5
Source File: TestCustomTermFreq.java    From lucene-solr with Apache License 2.0
public void testInvalidTermVectorPositions() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())));

  Document doc = new Document();
  FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
  fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  fieldType.setStoreTermVectors(true);
  fieldType.setStoreTermVectorPositions(true);
  Field field = new Field("field",
                          new CannedTermFreqs(new String[] {"foo", "bar", "foo", "bar"},
                                              new int[] {42, 128, 17, 100}),
                          fieldType);
  doc.add(field);
  Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);});
  assertEquals("field \"field\": cannot index term vector positions while using custom TermFrequencyAttribute", e.getMessage());
  IOUtils.close(w, dir);
}
 
Example 6
Source File: FilePositionDoc.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
public static Document Document(String inLine, int lineNumber) {
  Document doc = new Document();
  doc.add(new StoredField("line_number", "" + lineNumber));
  doc.add(new StoredField("modified",
                    DateTools.timeToString(System.currentTimeMillis(), DateTools.Resolution.MINUTE)));

  //create new FieldType to store term positions (TextField is not sufficiently configurable)
  FieldType ft = new FieldType();
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  ft.setTokenized(true);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  Field contentsField = new Field("contents", inLine, ft);

  doc.add(contentsField);
  return doc;
}
 
Example 7
Source File: BackwardsTermQueryTest.java    From lucene-query-example with Apache License 2.0
Field newField(String name, String value, Store stored) {
	FieldType tagsFieldType = new FieldType();
	tagsFieldType.setStored(stored == Store.YES);
	IndexOptions opts = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
	tagsFieldType.setIndexOptions(opts);
	return new Field(name, value, tagsFieldType);
}
 
Example 8
Source File: SolrDocumentFetcher.java    From lucene-solr with Apache License 2.0
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
  Predicate<String> readAsBytes = ResultContext.READASBYTES.get();
  if (readAsBytes != null && readAsBytes.test(fieldInfo.name)) {
    final FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(fieldInfo.hasVectors());
    ft.setOmitNorms(fieldInfo.omitsNorms());
    ft.setIndexOptions(fieldInfo.getIndexOptions());
    Objects.requireNonNull(value, "String value should not be null");
    doc.add(new StoredField(fieldInfo.name, value, ft));
  } else {
    super.stringField(fieldInfo, value);
  }

}
 
Example 9
Source File: LuceneIndexer.java    From ontopia with Apache License 2.0
protected FieldType getFieldType(FieldIF field) {
  FieldType type = new FieldType();
  type.setStored(field.isStored());
  type.setTokenized(field.isTokenized());
  type.setIndexOptions(field.isIndexed() ? IndexOptions.DOCS_AND_FREQS : IndexOptions.NONE);
  return type;
}
 
Example 10
Source File: SpellChecker.java    From lucene-solr with Apache License 2.0
private static void addGram(String text, Document doc, int ng1, int ng2) {
  int len = text.length();
  for (int ng = ng1; ng <= ng2; ng++) {
    String key = "gram" + ng;
    String end = null;
    for (int i = 0; i < len - ng + 1; i++) {
      String gram = text.substring(i, i + ng);
      FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
      Field ngramField = new Field(key, gram, ft);
      // spellchecker does not use positional queries, but we want freqs
      // for scoring these multivalued n-gram fields.
      doc.add(ngramField);
      if (i == 0) {
        // only one term possible in the startXXField, TF/pos and norms aren't needed.
        Field startField = new StringField("start" + ng, gram, Field.Store.NO);
        doc.add(startField);
      }
      end = gram;
    }
    if (end != null) { // may not be present if len==ng1
      // only one term possible in the endXXField, TF/pos and norms aren't needed.
      Field endField = new StringField("end" + ng, end, Field.Store.NO);
      doc.add(endField);
    }
  }
}
 
Example 11
Source File: StringIndexConverter.java    From jstarcraft-core with Apache License 2.0
@Override
public Iterable<IndexableField> convert(LuceneContext context, String path, Field field, LuceneIndex annotation, Type type, Object data) {
    Collection<IndexableField> indexables = new LinkedList<>();
    FieldType configuration = new FieldType();
    configuration.setIndexOptions(IndexOptions.DOCS);
    if (annotation.analyze()) {
        configuration.setTokenized(true);

        LuceneTerm negative = annotation.negative();
        if (negative.offset()) {
            configuration.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        } else if (negative.position()) {
            configuration.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        } else if (negative.frequency()) {
            configuration.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        }

        LuceneTerm positive = annotation.positive();
        if (positive.offset()) {
            configuration.setStoreTermVectorOffsets(true);
        }
        if (positive.position()) {
            configuration.setStoreTermVectorPositions(true);
        }
        if (positive.frequency()) {
            configuration.setStoreTermVectors(true);
        }
    }
    indexables.add(new org.apache.lucene.document.Field(path, (String) data, configuration));
    return indexables;
}
 
Example 12
Source File: TestMemoryIndex.java    From lucene-solr with Apache License 2.0
@Test
public void testOmitNorms() throws IOException {
  MemoryIndex mi = new MemoryIndex();
  FieldType ft = new FieldType();
  ft.setTokenized(true);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  ft.setOmitNorms(true);
  mi.addField(new Field("f1", "some text in here", ft), analyzer);
  mi.freeze();

  LeafReader reader = (LeafReader) mi.createSearcher().getIndexReader();
  NumericDocValues norms = reader.getNormValues("f1");
  assertNull(norms);
}
 
Example 13
Source File: TestIndexWriterExceptions.java    From lucene-solr with Apache License 2.0
@Nightly
public void testTooManyTokens() throws Exception {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  doc.add(new Field("foo", new TokenStream() {
    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    long num = 0;
    
    @Override
    public boolean incrementToken() throws IOException {
      if (num == Integer.MAX_VALUE + 1L) { // 1L keeps the sum in long math; "+ 1" alone overflows int
        return false;
      }
      clearAttributes();
      if (num == 0) {
        posIncAtt.setPositionIncrement(1);
      } else {
        posIncAtt.setPositionIncrement(0);
      }
      termAtt.append("a");
      num++;
      if (VERBOSE && num % 1000000 == 0) {
        System.out.println("indexed: " + num);
      }
      return true;
    }
  }, ft));

  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    iw.addDocument(doc);
  });
  assertTrue(expected.getMessage().contains("too many tokens"));

  iw.close();
  dir.close();
}
 
Example 14
Source File: TestIndexWriter.java    From lucene-solr with Apache License 2.0
public void testIndexStoreCombos() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  byte[] b = new byte[50];
  for(int i=0;i<50;i++)
    b[i] = (byte) (i+77);

  Document doc = new Document();

  FieldType customType = new FieldType(StoredField.TYPE);
  customType.setTokenized(true);

  Field f = new Field("binary", b, 10, 17, customType);
  // TODO: this is evil, changing the type after creating the field:
  customType.setIndexOptions(IndexOptions.DOCS);
  final MockTokenizer doc1field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc1field1.setReader(new StringReader("doc1field1"));
  f.setTokenStream(doc1field1);

  FieldType customType2 = new FieldType(TextField.TYPE_STORED);

  Field f2 = newField("string", "value", customType2);
  final MockTokenizer doc1field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc1field2.setReader(new StringReader("doc1field2"));
  f2.setTokenStream(doc1field2);
  doc.add(f);
  doc.add(f2);
  w.addDocument(doc);

  // add 2 docs to test in-memory merging
  final MockTokenizer doc2field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc2field1.setReader(new StringReader("doc2field1"));
  f.setTokenStream(doc2field1);
  final MockTokenizer doc2field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc2field2.setReader(new StringReader("doc2field2"));
  f2.setTokenStream(doc2field2);
  w.addDocument(doc);

  // force segment flush so we can force a segment merge with doc3 later.
  w.commit();

  final MockTokenizer doc3field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc3field1.setReader(new StringReader("doc3field1"));
  f.setTokenStream(doc3field1);
  final MockTokenizer doc3field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc3field2.setReader(new StringReader("doc3field2"));
  f2.setTokenStream(doc3field2);

  w.addDocument(doc);
  w.commit();
  w.forceMerge(1);   // force segment merge.
  w.close();

  IndexReader ir = DirectoryReader.open(dir);
  Document doc2 = ir.document(0);
  IndexableField f3 = doc2.getField("binary");
  b = f3.binaryValue().bytes;
  assertTrue(b != null);
  assertEquals(17, b.length);
  assertEquals(87, b[0]);

  assertTrue(ir.document(0).getField("binary").binaryValue()!=null);
  assertTrue(ir.document(1).getField("binary").binaryValue()!=null);
  assertTrue(ir.document(2).getField("binary").binaryValue()!=null);

  assertEquals("value", ir.document(0).get("string"));
  assertEquals("value", ir.document(1).get("string"));
  assertEquals("value", ir.document(2).get("string"));


  // test that the terms were indexed.
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc1field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc2field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc3field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc1field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc2field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc3field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

  ir.close();
  dir.close();

}
 
Example 15
Source File: BBoxStrategy.java    From lucene-solr with Apache License 2.0
/**
 * Creates this strategy.
 * {@code fieldType} is used to customize the indexing options of the 4 number fields, and to a lesser degree the XDL
 * field too. Search requires pointValues (or legacy numerics), and relevancy requires docValues. If these features
 * aren't needed then disable them.
 */
public BBoxStrategy(SpatialContext ctx, String fieldNamePrefix, FieldType fieldType) {
  super(ctx, fieldNamePrefix);
  field_bbox = fieldNamePrefix;
  field_minX = fieldNamePrefix + SUFFIX_MINX;
  field_maxX = fieldNamePrefix + SUFFIX_MAXX;
  field_minY = fieldNamePrefix + SUFFIX_MINY;
  field_maxY = fieldNamePrefix + SUFFIX_MAXY;
  field_xdl = fieldNamePrefix + SUFFIX_XDL;

  fieldType.freeze();
  this.optionsFieldType = fieldType;

  int numQuads = 0;
  if ((this.hasStored = fieldType.stored())) {
    numQuads++;
  }
  if ((this.hasDocVals = fieldType.docValuesType() != DocValuesType.NONE)) {
    numQuads++;
  }
  if ((this.hasPointVals = fieldType.pointDimensionCount() > 0)) {
    numQuads++;
  }
  if (fieldType.indexOptions() != IndexOptions.NONE && fieldType instanceof LegacyFieldType && ((LegacyFieldType)fieldType).numericType() != null) {
    if (hasPointVals) {
      throw new IllegalArgumentException("pointValues and LegacyNumericType are mutually exclusive");
    }
    final LegacyFieldType legacyType = (LegacyFieldType) fieldType;
    if (legacyType.numericType() != LegacyNumericType.DOUBLE) {
      throw new IllegalArgumentException(getClass() + " does not support " + legacyType.numericType());
    }
    numQuads++;
    legacyNumericFieldType = new LegacyFieldType(LegacyDoubleField.TYPE_NOT_STORED);
    legacyNumericFieldType.setNumericPrecisionStep(legacyType.numericPrecisionStep());
    legacyNumericFieldType.freeze();
  } else {
    legacyNumericFieldType = null;
  }

  if (hasPointVals || legacyNumericFieldType != null) { // if we have an index...
    xdlFieldType = new FieldType(StringField.TYPE_NOT_STORED);
    xdlFieldType.setIndexOptions(IndexOptions.DOCS);
    xdlFieldType.freeze();
  } else {
    xdlFieldType = null;
  }

  this.fieldsLen = numQuads * 4 + (xdlFieldType != null ? 1 : 0);
}
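Which branches fire depends entirely on the FieldType passed in. As a hypothetical sketch (the context, prefix, and variable names are illustrative, and the spatial4j package name assumes a 0.7+ release), a caller wanting pointValues for search plus docValues for relevancy, per the javadoc above, might write:

import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.spatial.bbox.BBoxStrategy;
import org.locationtech.spatial4j.context.SpatialContext;

SpatialContext ctx = SpatialContext.GEO;
FieldType bboxType = new FieldType();
bboxType.setDimensions(1, Double.BYTES);          // pointDimensionCount() > 0 -> hasPointVals
bboxType.setDocValuesType(DocValuesType.NUMERIC); // docValuesType() != NONE   -> hasDocVals
BBoxStrategy strategy = new BBoxStrategy(ctx, "bbox", bboxType);
// numQuads is 2 here, the DOCS-indexed XDL field is created,
// and fieldsLen comes out to 2 * 4 + 1 = 9.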
 
Example 16
Source File: TestExceedMaxTermLength.java    From lucene-solr with Apache License 2.0
public void test() throws Exception {
  
  IndexWriter w = new IndexWriter
    (dir, newIndexWriterConfig(random(), new MockAnalyzer(random())));
  try {
    final FieldType ft = new FieldType();
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    ft.setStored(random().nextBoolean());
    ft.freeze();
    
    final Document doc = new Document();
    if (random().nextBoolean()) {
      // totally ok short field value
      doc.add(new Field(TestUtil.randomSimpleString(random(), 1, 10),
                        TestUtil.randomSimpleString(random(), 1, 10),
                        ft));
    }
    // problematic field
    final String name = TestUtil.randomSimpleString(random(), 1, 50);
    final String value = TestUtil.randomSimpleString(random(),
                                                     minTestTermLength,
                                                     maxTestTermLegnth);
    final Field f = new Field(name, value, ft);
    if (random().nextBoolean()) {
      // totally ok short field value
      doc.add(new Field(TestUtil.randomSimpleString(random(), 1, 10),
                        TestUtil.randomSimpleString(random(), 1, 10),
                        ft));
    }
    doc.add(f);
    
    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
      w.addDocument(doc);
    });
    String maxLengthMsg = String.valueOf(IndexWriter.MAX_TERM_LENGTH);
    String msg = expected.getMessage();
    assertTrue("IllegalArgumentException didn't mention 'immense term': " + msg,
               msg.contains("immense term"));
    assertTrue("IllegalArgumentException didn't mention max length ("+maxLengthMsg+"): " + msg,
               msg.contains(maxLengthMsg));
    assertTrue("IllegalArgumentException didn't mention field name ("+name+"): " + msg,
               msg.contains(name));
    assertTrue("IllegalArgumentException didn't mention original message: " + msg,
               msg.contains("bytes can be at most") && msg.contains("in length; got"));
  } finally {
    w.close();
  }
}
 
Example 17
Source File: TestBackwardsCompatibility.java    From lucene-solr with Apache License 2.0
private void addDoc(IndexWriter writer, int id) throws IOException
{
  Document doc = new Document();
  doc.add(new TextField("content", "aaa", Field.Store.NO));
  doc.add(new StringField("id", Integer.toString(id), Field.Store.YES));
  FieldType customType2 = new FieldType(TextField.TYPE_STORED);
  customType2.setStoreTermVectors(true);
  customType2.setStoreTermVectorPositions(true);
  customType2.setStoreTermVectorOffsets(true);
  doc.add(new Field("autf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", customType2));
  doc.add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", customType2));
  doc.add(new Field("content2", "here is more content with aaa aaa aaa", customType2));
  doc.add(new Field("fie\u2C77ld", "field with non-ascii name", customType2));

  // add docvalues fields
  doc.add(new NumericDocValuesField("dvByte", (byte) id));
  byte bytes[] = new byte[] {
    (byte)(id >>> 24), (byte)(id >>> 16),(byte)(id >>> 8),(byte)id
  };
  BytesRef ref = new BytesRef(bytes);
  doc.add(new BinaryDocValuesField("dvBytesDerefFixed", ref));
  doc.add(new BinaryDocValuesField("dvBytesDerefVar", ref));
  doc.add(new SortedDocValuesField("dvBytesSortedFixed", ref));
  doc.add(new SortedDocValuesField("dvBytesSortedVar", ref));
  doc.add(new BinaryDocValuesField("dvBytesStraightFixed", ref));
  doc.add(new BinaryDocValuesField("dvBytesStraightVar", ref));
  doc.add(new DoubleDocValuesField("dvDouble", (double)id));
  doc.add(new FloatDocValuesField("dvFloat", (float)id));
  doc.add(new NumericDocValuesField("dvInt", id));
  doc.add(new NumericDocValuesField("dvLong", id));
  doc.add(new NumericDocValuesField("dvPacked", id));
  doc.add(new NumericDocValuesField("dvShort", (short)id));
  doc.add(new SortedSetDocValuesField("dvSortedSet", ref));
  doc.add(new SortedNumericDocValuesField("dvSortedNumeric", id));

  doc.add(new IntPoint("intPoint1d", id));
  doc.add(new IntPoint("intPoint2d", id, 2*id));
  doc.add(new FloatPoint("floatPoint1d", (float) id));
  doc.add(new FloatPoint("floatPoint2d", (float) id, (float) 2*id));
  doc.add(new LongPoint("longPoint1d", id));
  doc.add(new LongPoint("longPoint2d", id, 2*id));
  doc.add(new DoublePoint("doublePoint1d", (double) id));
  doc.add(new DoublePoint("doublePoint2d", (double) id, (double) 2*id));
  doc.add(new BinaryPoint("binaryPoint1d", bytes));
  doc.add(new BinaryPoint("binaryPoint2d", bytes, bytes));
  
  // a field with both offsets and term vectors for a cross-check
  FieldType customType3 = new FieldType(TextField.TYPE_STORED);
  customType3.setStoreTermVectors(true);
  customType3.setStoreTermVectorPositions(true);
  customType3.setStoreTermVectorOffsets(true);
  customType3.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  doc.add(new Field("content5", "here is more content with aaa aaa aaa", customType3));
  // a field that omits only positions
  FieldType customType4 = new FieldType(TextField.TYPE_STORED);
  customType4.setStoreTermVectors(true);
  customType4.setStoreTermVectorPositions(false);
  customType4.setStoreTermVectorOffsets(true);
  customType4.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  doc.add(new Field("content6", "here is more content with aaa aaa aaa", customType4));
  // TODO: 
  //   index different norms types via similarity (we use a random one currently?!)
  //   remove any analyzer randomness, explicitly add payloads for certain fields.
  writer.addDocument(doc);
}
 
Example 18
Source File: TestOmitPositions.java    From lucene-solr with Apache License 2.0
public void testPositions() throws Exception {
  Directory ram = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(analyzer));
  Document d = new Document();
      
  // f1: docs only
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS);
  
  Field f1 = newField("f1", "This field has docs only", ft);
  d.add(f1);

  FieldType ft2 = new FieldType(TextField.TYPE_NOT_STORED);
  ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  
  // f2: docs and freqs
  Field f2 = newField("f2", "This field has docs and freqs", ft2);
  d.add(f2);
  
  FieldType ft3 = new FieldType(TextField.TYPE_NOT_STORED);
  ft3.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  
  // f3: docs/freqs/positions
  Field f3 = newField("f3", "This field has docs and freqs and positions", ft3);
  d.add(f3);
      
  writer.addDocument(d);
  writer.forceMerge(1);
  // flush
  writer.close();

  LeafReader reader = getOnlyLeafReader(DirectoryReader.open(ram));
  FieldInfos fi = reader.getFieldInfos();
  // docs + docs = docs
  assertEquals(IndexOptions.DOCS, fi.fieldInfo("f1").getIndexOptions());
  // docs/freqs + docs/freqs = docs/freqs
  assertEquals(IndexOptions.DOCS_AND_FREQS, fi.fieldInfo("f2").getIndexOptions());
  // docs/freqs/pos + docs/freqs/pos = docs/freqs/pos
  assertEquals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, fi.fieldInfo("f3").getIndexOptions());
  
  reader.close();
  ram.close();
}
 
Example 19
Source File: Test2BTerms.java    From lucene-solr with Apache License 2.0
public void test2BTerms() throws IOException {

    System.out.println("Starting Test2B");
    final long TERM_COUNT = ((long) Integer.MAX_VALUE) + 100000000;

    final int TERMS_PER_DOC = TestUtil.nextInt(random(), 100000, 1000000);

    List<BytesRef> savedTerms = null;

    BaseDirectoryWrapper dir = newFSDirectory(createTempDir("2BTerms"));
    //MockDirectoryWrapper dir = newFSDirectory(new File("/p/lucene/indices/2bindex"));
    if (dir instanceof MockDirectoryWrapper) {
      ((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
    }
    dir.setCheckIndexOnClose(false); // don't double-checkindex

    if (true) {

      IndexWriter w = new IndexWriter(dir,
                                      new IndexWriterConfig(new MockAnalyzer(random()))
                                      .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                                      .setRAMBufferSizeMB(256.0)
                                      .setMergeScheduler(new ConcurrentMergeScheduler())
                                      .setMergePolicy(newLogMergePolicy(false, 10))
                                      .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
                                      .setCodec(TestUtil.getDefaultCodec()));

      MergePolicy mp = w.getConfig().getMergePolicy();
      if (mp instanceof LogByteSizeMergePolicy) {
        // 1 petabyte:
        ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024*1024*1024);
      }

      Document doc = new Document();
      final MyTokenStream ts = new MyTokenStream(random(), TERMS_PER_DOC);

      FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setIndexOptions(IndexOptions.DOCS);
      customType.setOmitNorms(true);
      Field field = new Field("field", ts, customType);
      doc.add(field);
      //w.setInfoStream(System.out);
      final int numDocs = (int) (TERM_COUNT/TERMS_PER_DOC);

      System.out.println("TERMS_PER_DOC=" + TERMS_PER_DOC);
      System.out.println("numDocs=" + numDocs);

      for(int i=0;i<numDocs;i++) {
        final long t0 = System.currentTimeMillis();
        w.addDocument(doc);
        System.out.println(i + " of " + numDocs + " " + (System.currentTimeMillis()-t0) + " msec");
      }
      savedTerms = ts.savedTerms;

      System.out.println("TEST: full merge");
      w.forceMerge(1);
      System.out.println("TEST: close writer");
      w.close();
    }

    System.out.println("TEST: open reader");
    final IndexReader r = DirectoryReader.open(dir);
    if (savedTerms == null) {
      savedTerms = findTerms(r);
    }
    final int numSavedTerms = savedTerms.size();
    final List<BytesRef> bigOrdTerms = new ArrayList<>(savedTerms.subList(numSavedTerms-10, numSavedTerms));
    System.out.println("TEST: test big ord terms...");
    testSavedTerms(r, bigOrdTerms);
    System.out.println("TEST: test all saved terms...");
    testSavedTerms(r, savedTerms);
    r.close();

    System.out.println("TEST: now CheckIndex...");
    CheckIndex.Status status = TestUtil.checkIndex(dir);
    final long tc = status.segmentInfos.get(0).termIndexStatus.termCount;
    assertTrue("count " + tc + " is not > " + Integer.MAX_VALUE, tc > Integer.MAX_VALUE);

    dir.close();
    System.out.println("TEST: done!");
  }
 
Example 20
Source File: Index.java    From dacapobench with Apache License 2.0
/**
 * Index either a file or a directory tree.
 * 
 * @param writer
 * @param file
 * @throws IOException
 */
void indexDocs(IndexWriter writer, File file) throws IOException {

  /* Strip the absolute part of the path name from file name output */
  int scratchP = scratch.getCanonicalPath().length() + 1;

  /* do not try to index files that cannot be read */
  if (file.canRead()) {
    if (file.isDirectory()) {
      String[] files = file.list();
      // an IO error could occur
      if (files != null) {
        Arrays.sort(files);
        for (int i = 0; i < files.length; i++) {
          indexDocs(writer, new File(file, files[i]));
        }
      }
    } else {
      System.out.println("adding " + file.getCanonicalPath().substring(scratchP));
      try {
        Document doc = new Document();
        FieldType docFT = new FieldType();
        docFT.setTokenized(false);
        docFT.setStored(true);
        docFT.setIndexOptions(IndexOptions.DOCS);

        // Add the path of the file as a field named "path".  Use a field that is
        // indexed (i.e. searchable), but don't tokenize the field into words.
        doc.add(new Field("path", file.getPath(), docFT));

        // Add the last modified date of the file as a field named "modified".  Use
        // a field that is indexed (i.e. searchable), but don't tokenize the field
        // into words.
        doc.add(new Field("modified",
                DateTools.timeToString(file.lastModified(), DateTools.Resolution.MINUTE),
                docFT));

        // Add the contents of the file to a field named "contents".  Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in the system's default encoding.
        // If that's not the case searching for special characters will fail.
        // Use a separate FieldType here: a Field keeps a reference to its type, so
        // mutating docFT would also change how the "path" and "modified" fields
        // above are indexed when addDocument() runs.
        FieldType contentsFT = new FieldType();
        contentsFT.setTokenized(true);
        contentsFT.setStored(false);
        contentsFT.setIndexOptions(IndexOptions.DOCS);
        doc.add(new Field("contents", new FileReader(file), contentsFT));
        writer.addDocument(doc);
      }
      // at least on windows, some temporary files raise this exception with
      // an "access denied" message
      // checking if the file can be read doesn't help
      catch (FileNotFoundException fnfe) { }
    }
  }
}