Java Code Examples for org.apache.lucene.document.FieldType#setTokenized()
The following examples show how to use org.apache.lucene.document.FieldType#setTokenized(). Each example is taken from the open-source project and source file named above it.
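Before the examples, a minimal sketch of the typical pattern (the field names, values, and option choices here are illustrative, not taken from any example below): setTokenized(false) indexes the whole field value as a single term, much like StringField, while setTokenized(true) runs the value through the analyzer, much like TextField.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;

// A keyword-style type: the whole value becomes one term.
FieldType keywordType = new FieldType();
keywordType.setIndexOptions(IndexOptions.DOCS);
keywordType.setTokenized(false); // do not run the value through the analyzer
keywordType.setStored(true);
keywordType.freeze();            // optional: prevent further changes to the type

// An analyzed type: the value is split into terms by the analyzer.
FieldType analyzedType = new FieldType();
analyzedType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
analyzedType.setTokenized(true);

Document doc = new Document();
doc.add(new Field("id", "doc-42", keywordType));
doc.add(new Field("body", "some analyzed text", analyzedType));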
Example 1
Source File: TestUnifiedHighlighterTermIntervals.java From lucene-solr with Apache License 2.0
private IndexReader indexSomeFields() throws IOException {
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
  FieldType ft = new FieldType();
  ft.setIndexOptions(IndexOptions.NONE);
  ft.setTokenized(false);
  ft.setStored(true);
  ft.freeze();
  // 'fieldType' below is a member of the enclosing test class; the local 'ft'
  // configured above is left unused by the original test.
  Field title = new Field("title", "", fieldType);
  Field text = new Field("text", "", fieldType);
  Field category = new Field("category", "", fieldType);
  Document doc = new Document();
  doc.add(title);
  doc.add(text);
  doc.add(category);
  title.setStringValue("This is the title field.");
  text.setStringValue("This is the text field. You can put some text if you want.");
  category.setStringValue("This is the category field.");
  iw.addDocument(doc);
  IndexReader ir = iw.getReader();
  iw.close();
  return ir;
}
Example 2
Source File: PresearcherTestBase.java From lucene-solr with Apache License 2.0
public void testNonStringTermHandling() throws IOException {
  FieldType ft = new FieldType();
  ft.setTokenized(true);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  try (Monitor monitor = newMonitor()) {
    monitor.register(new MonitorQuery("1", new TermQuery(new Term("f", NON_STRING_TERM))));
    Document doc = new Document();
    doc.add(new Field("f", new NonStringTokenStream(), ft));
    MatchingQueries<QueryMatch> m = monitor.match(doc, QueryMatch.SIMPLE_MATCHER);
    assertEquals(1, m.getMatchCount());
    assertEquals(1, m.getQueriesRun());
  }
}
Example 3
Source File: FilePositionDoc.java From semanticvectors with BSD 3-Clause "New" or "Revised" License
public static Document Document(String inLine, int lineNumber) {
  Document doc = new Document();
  doc.add(new StoredField("line_number", "" + lineNumber));
  doc.add(new StoredField("modified",
      DateTools.timeToString(System.currentTimeMillis(), DateTools.Resolution.MINUTE)));

  // create a new FieldType to store term positions (TextField is not sufficiently configurable)
  FieldType ft = new FieldType();
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  ft.setTokenized(true);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  Field contentsField = new Field("contents", inLine, ft);
  doc.add(contentsField);

  return doc;
}
Example 4
Source File: TestMultiTermConstantScore.java From lucene-solr with Apache License 2.0
@BeforeClass
public static void beforeClass() throws Exception {
  String[] data = new String[] {
      "A 1 2 3 4 5 6", "Z 4 5 6", null,
      "B 2 4 5 6", "Y 3 5 6", null,
      "C 3 6", "X 4 5 6" };

  small = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), small,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false))
          .setMergePolicy(newLogMergePolicy()));

  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setTokenized(false);
  for (int i = 0; i < data.length; i++) {
    Document doc = new Document();
    doc.add(newField("id", String.valueOf(i), customType));   // Field.Keyword("id", String.valueOf(i))
    doc.add(newField("all", "all", customType));              // Field.Keyword("all", "all")
    if (null != data[i]) {
      doc.add(newTextField("data", data[i], Field.Store.YES)); // Field.Text("data", data[i])
    }
    writer.addDocument(doc);
  }

  reader = writer.getReader();
  writer.close();
}
Example 5
Source File: StringIndexConverter.java From jstarcraft-core with Apache License 2.0
@Override
public Iterable<IndexableField> convert(LuceneContext context, String path, Field field,
    LuceneIndex annotation, Type type, Object data) {
  Collection<IndexableField> indexables = new LinkedList<>();
  FieldType configuration = new FieldType();
  configuration.setIndexOptions(IndexOptions.DOCS);
  if (annotation.analyze()) {
    configuration.setTokenized(true);

    LuceneTerm negative = annotation.negative();
    if (negative.offset()) {
      configuration.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    } else if (negative.position()) {
      configuration.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else if (negative.frequency()) {
      configuration.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }

    LuceneTerm positive = annotation.positive();
    if (positive.offset()) {
      configuration.setStoreTermVectorOffsets(true);
    }
    if (positive.position()) {
      configuration.setStoreTermVectorPositions(true);
    }
    if (positive.frequency()) {
      configuration.setStoreTermVectors(true);
    }
  }
  indexables.add(new org.apache.lucene.document.Field(path, (String) data, configuration));
  return indexables;
}
Example 6
Source File: TestOrdValues.java From lucene-solr with Apache License 2.0
private static void addDoc(RandomIndexWriter iw, int i) throws Exception {
  Document d = new Document();
  Field f;
  int scoreAndID = i + 1;

  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setTokenized(false);
  customType.setOmitNorms(true);
  f = newField(ID_FIELD, id2String(scoreAndID), customType); // for debug purposes
  d.add(f);
  d.add(new SortedDocValuesField(ID_FIELD, new BytesRef(id2String(scoreAndID))));

  FieldType customType2 = new FieldType(TextField.TYPE_NOT_STORED);
  customType2.setOmitNorms(true);
  f = newField(TEXT_FIELD, "text of doc" + scoreAndID + textLine(i), customType2); // for regular search
  d.add(f);

  f = new LegacyIntField(INT_FIELD, scoreAndID, Store.YES); // for function scoring
  d.add(f);
  d.add(new NumericDocValuesField(INT_FIELD, scoreAndID));

  f = new LegacyFloatField(FLOAT_FIELD, scoreAndID, Store.YES); // for function scoring
  d.add(f);
  d.add(new NumericDocValuesField(FLOAT_FIELD, Float.floatToRawIntBits(scoreAndID)));

  iw.addDocument(d);
  log("added: " + d);
}
Example 7
Source File: UseLucene.java From mmseg4j-solr with Apache License 2.0
private Document createDoc(int id) {
  Document doc = new Document();

  FieldType ft = new FieldType();
  ft.setTokenized(true);
  ft.setStored(true);
  ft.setIndexOptions(IndexOptions.DOCS);
  doc.add(new Field("id", "" + id, ft));

  FieldType ft2 = new FieldType();
  ft2.setTokenized(true);
  ft2.setStored(true);
  ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  doc.add(new Field("name", "echo ensh id " + id, ft2));

  return doc;
}
Example 8
Source File: LuceneIndexer.java From ontopia with Apache License 2.0
protected FieldType getFieldType(FieldIF field) {
  FieldType type = new FieldType();
  type.setStored(field.isStored());
  type.setTokenized(field.isTokenized());
  type.setIndexOptions(field.isIndexed() ? IndexOptions.DOCS_AND_FREQS : IndexOptions.NONE);
  return type;
}
Example 9
Source File: TestMemoryIndex.java From lucene-solr with Apache License 2.0
@Test
public void testOmitNorms() throws IOException {
  MemoryIndex mi = new MemoryIndex();
  FieldType ft = new FieldType();
  ft.setTokenized(true);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  ft.setOmitNorms(true);
  mi.addField(new Field("f1", "some text in here", ft), analyzer);
  mi.freeze();

  LeafReader leader = (LeafReader) mi.createSearcher().getIndexReader();
  NumericDocValues norms = leader.getNormValues("f1");
  assertNull(norms);
}
Example 10
Source File: IndexOptionsDialogFactory.java From lucene-solr with Apache License 2.0
private void saveOptions() {
  nf.setStored(storedCB.isSelected());
  if (nf.getType().equals(Field.class)) {
    FieldType ftype = (FieldType) nf.getFieldType();
    ftype.setStored(storedCB.isSelected());
    ftype.setTokenized(tokenizedCB.isSelected());
    ftype.setOmitNorms(omitNormsCB.isSelected());
    ftype.setIndexOptions(IndexOptions.valueOf((String) idxOptCombo.getSelectedItem()));
    ftype.setStoreTermVectors(storeTVCB.isSelected());
    ftype.setStoreTermVectorPositions(storeTVPosCB.isSelected());
    ftype.setStoreTermVectorOffsets(storeTVOffCB.isSelected());
    ftype.setStoreTermVectorPayloads(storeTVPayCB.isSelected());
  }
  dialog.dispose();
}
Example 11
Source File: TermVectorsAdapterTest.java From lucene-solr with Apache License 2.0
@Override
protected void createIndex() throws IOException {
  indexDir = createTempDir("testIndex");
  Directory dir = newFSDirectory(indexDir);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new StandardAnalyzer());

  FieldType textType = new FieldType();
  textType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  textType.setTokenized(true);
  textType.setStoreTermVectors(true);

  FieldType textType_pos = new FieldType();
  textType_pos.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  textType_pos.setTokenized(true);
  textType_pos.setStoreTermVectors(true);
  textType_pos.setStoreTermVectorPositions(true);

  FieldType textType_pos_offset = new FieldType();
  textType_pos_offset.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  textType_pos_offset.setTokenized(true);
  textType_pos_offset.setStoreTermVectors(true);
  textType_pos_offset.setStoreTermVectorPositions(true);
  textType_pos_offset.setStoreTermVectorOffsets(true);

  String text = "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.";
  Document doc = new Document();
  doc.add(newField("text1", text, textType));
  doc.add(newField("text2", text, textType_pos));
  doc.add(newField("text3", text, textType_pos_offset));
  writer.addDocument(doc);

  writer.commit();
  writer.close();
  dir.close();
}
Example 12
Source File: TestPerFieldPostingsFormat2.java From lucene-solr with Apache License 2.0
@Test
public void testStressPerFieldCodec() throws IOException {
  Directory dir = newDirectory(random());
  final int docsPerRound = 97;
  int numRounds = atLeast(1);
  for (int i = 0; i < numRounds; i++) {
    int num = TestUtil.nextInt(random(), 30, 60);
    IndexWriterConfig config = newIndexWriterConfig(random(), new MockAnalyzer(random()));
    config.setOpenMode(OpenMode.CREATE_OR_APPEND);
    IndexWriter writer = newWriter(dir, config);
    for (int j = 0; j < docsPerRound; j++) {
      final Document doc = new Document();
      for (int k = 0; k < num; k++) {
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
        customType.setTokenized(random().nextBoolean());
        customType.setOmitNorms(random().nextBoolean());
        Field field = newField("" + k, TestUtil.randomRealisticUnicodeString(random(), 128), customType);
        doc.add(field);
      }
      writer.addDocument(doc);
    }
    if (random().nextBoolean()) {
      writer.forceMerge(1);
    }
    writer.commit();
    assertEquals((i + 1) * docsPerRound, writer.getDocStats().maxDoc);
    writer.close();
  }
  dir.close();
}
Example 13
Source File: LuceneIndexer.java From MtgDesktopCompanion with GNU General Public License v3.0
private Document toDocuments(MagicCard mc) {
  Document doc = new Document();

  FieldType fieldType = new FieldType();
  fieldType.setStored(true);
  fieldType.setStoreTermVectors(true);
  fieldType.setTokenized(true);
  fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

  doc.add(new Field("name", mc.getName(), fieldType));

  if (mc.getCost() != null)
    doc.add(new Field("cost", mc.getCost(), fieldType));
  else
    doc.add(new Field("cost", "", fieldType));

  if (mc.getText() != null)
    doc.add(new Field("text", mc.getText(), fieldType));
  else
    doc.add(new Field("text", "", fieldType));

  doc.add(new Field("type", mc.getFullType(), fieldType));
  doc.add(new Field("set", mc.getCurrentSet().getId(), fieldType));
  doc.add(new StoredField("cmc", mc.getCmc()));
  doc.add(new StringField("data", serializer.toJson(mc), Field.Store.YES));

  for (MTGColor color : mc.getColors()) {
    doc.add(new Field("color", color.getCode(), fieldType));
  }

  return doc;
}
Example 14
Source File: TestDirectoryReader.java From lucene-solr with Apache License 2.0
static Document createDocument(String id) {
  Document doc = new Document();
  FieldType customType = new FieldType(TextField.TYPE_STORED);
  customType.setTokenized(false);
  customType.setOmitNorms(true);
  doc.add(newField("id", id, customType));
  return doc;
}
Example 15
Source File: TestConsistentFieldNumbers.java From lucene-solr with Apache License 2.0
private Field getField(int number) {
  int mode = number % 16;
  String fieldName = "" + number;

  FieldType customType = new FieldType(TextField.TYPE_STORED);

  FieldType customType2 = new FieldType(TextField.TYPE_STORED);
  customType2.setTokenized(false);

  FieldType customType3 = new FieldType(TextField.TYPE_NOT_STORED);
  customType3.setTokenized(false);

  FieldType customType4 = new FieldType(TextField.TYPE_NOT_STORED);
  customType4.setTokenized(false);
  customType4.setStoreTermVectors(true);
  customType4.setStoreTermVectorOffsets(true);

  FieldType customType5 = new FieldType(TextField.TYPE_NOT_STORED);
  customType5.setStoreTermVectors(true);
  customType5.setStoreTermVectorOffsets(true);

  FieldType customType6 = new FieldType(TextField.TYPE_STORED);
  customType6.setTokenized(false);
  customType6.setStoreTermVectors(true);
  customType6.setStoreTermVectorOffsets(true);

  FieldType customType7 = new FieldType(TextField.TYPE_NOT_STORED);
  customType7.setTokenized(false);
  customType7.setStoreTermVectors(true);
  customType7.setStoreTermVectorOffsets(true);

  FieldType customType8 = new FieldType(TextField.TYPE_STORED);
  customType8.setTokenized(false);
  customType8.setStoreTermVectors(true);
  customType8.setStoreTermVectorPositions(true);

  FieldType customType9 = new FieldType(TextField.TYPE_NOT_STORED);
  customType9.setStoreTermVectors(true);
  customType9.setStoreTermVectorPositions(true);

  FieldType customType10 = new FieldType(TextField.TYPE_STORED);
  customType10.setTokenized(false);
  customType10.setStoreTermVectors(true);
  customType10.setStoreTermVectorPositions(true);

  FieldType customType11 = new FieldType(TextField.TYPE_NOT_STORED);
  customType11.setTokenized(false);
  customType11.setStoreTermVectors(true);
  customType11.setStoreTermVectorPositions(true);

  FieldType customType12 = new FieldType(TextField.TYPE_STORED);
  customType12.setStoreTermVectors(true);
  customType12.setStoreTermVectorOffsets(true);
  customType12.setStoreTermVectorPositions(true);

  FieldType customType13 = new FieldType(TextField.TYPE_NOT_STORED);
  customType13.setStoreTermVectors(true);
  customType13.setStoreTermVectorOffsets(true);
  customType13.setStoreTermVectorPositions(true);

  FieldType customType14 = new FieldType(TextField.TYPE_STORED);
  customType14.setTokenized(false);
  customType14.setStoreTermVectors(true);
  customType14.setStoreTermVectorOffsets(true);
  customType14.setStoreTermVectorPositions(true);

  FieldType customType15 = new FieldType(TextField.TYPE_NOT_STORED);
  customType15.setTokenized(false);
  customType15.setStoreTermVectors(true);
  customType15.setStoreTermVectorOffsets(true);
  customType15.setStoreTermVectorPositions(true);

  switch (mode) {
    case 0: return new Field(fieldName, "some text", customType);
    case 1: return new TextField(fieldName, "some text", Field.Store.NO);
    case 2: return new Field(fieldName, "some text", customType2);
    case 3: return new Field(fieldName, "some text", customType3);
    case 4: return new Field(fieldName, "some text", customType4);
    case 5: return new Field(fieldName, "some text", customType5);
    case 6: return new Field(fieldName, "some text", customType6);
    case 7: return new Field(fieldName, "some text", customType7);
    case 8: return new Field(fieldName, "some text", customType8);
    case 9: return new Field(fieldName, "some text", customType9);
    case 10: return new Field(fieldName, "some text", customType10);
    case 11: return new Field(fieldName, "some text", customType11);
    case 12: return new Field(fieldName, "some text", customType12);
    case 13: return new Field(fieldName, "some text", customType13);
    case 14: return new Field(fieldName, "some text", customType14);
    case 15: return new Field(fieldName, "some text", customType15);
    default: return null;
  }
}
Example 16
Source File: TestIndexWriter.java From lucene-solr with Apache License 2.0
public void testIndexStoreCombos() throws Exception {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
  byte[] b = new byte[50];
  for (int i = 0; i < 50; i++)
    b[i] = (byte) (i + 77);

  Document doc = new Document();

  FieldType customType = new FieldType(StoredField.TYPE);
  customType.setTokenized(true);

  Field f = new Field("binary", b, 10, 17, customType);
  // TODO: this is evil, changing the type after creating the field:
  customType.setIndexOptions(IndexOptions.DOCS);
  final MockTokenizer doc1field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc1field1.setReader(new StringReader("doc1field1"));
  f.setTokenStream(doc1field1);

  FieldType customType2 = new FieldType(TextField.TYPE_STORED);
  Field f2 = newField("string", "value", customType2);
  final MockTokenizer doc1field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc1field2.setReader(new StringReader("doc1field2"));
  f2.setTokenStream(doc1field2);

  doc.add(f);
  doc.add(f2);
  w.addDocument(doc);

  // add 2 docs to test in-memory merging
  final MockTokenizer doc2field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc2field1.setReader(new StringReader("doc2field1"));
  f.setTokenStream(doc2field1);
  final MockTokenizer doc2field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc2field2.setReader(new StringReader("doc2field2"));
  f2.setTokenStream(doc2field2);
  w.addDocument(doc);

  // force segment flush so we can force a segment merge with doc3 later.
  w.commit();

  final MockTokenizer doc3field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc3field1.setReader(new StringReader("doc3field1"));
  f.setTokenStream(doc3field1);
  final MockTokenizer doc3field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  doc3field2.setReader(new StringReader("doc3field2"));
  f2.setTokenStream(doc3field2);

  w.addDocument(doc);
  w.commit();
  w.forceMerge(1); // force segment merge.
  w.close();

  IndexReader ir = DirectoryReader.open(dir);
  Document doc2 = ir.document(0);
  IndexableField f3 = doc2.getField("binary");
  b = f3.binaryValue().bytes;
  assertTrue(b != null);
  assertEquals(17, b.length);
  assertEquals(87, b[0]);

  assertTrue(ir.document(0).getField("binary").binaryValue() != null);
  assertTrue(ir.document(1).getField("binary").binaryValue() != null);
  assertTrue(ir.document(2).getField("binary").binaryValue() != null);

  assertEquals("value", ir.document(0).get("string"));
  assertEquals("value", ir.document(1).get("string"));
  assertEquals("value", ir.document(2).get("string"));

  // test that the terms were indexed.
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc1field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc2field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc3field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc1field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc2field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc3field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

  ir.close();
  dir.close();
}
Example 17
Source File: TestDirectoryReaderReopen.java From lucene-solr with Apache License 2.0
private void doTestReopenWithCommit(Random random, Directory dir, boolean withReopen) throws IOException {
  IndexWriter iwriter = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random))
      .setOpenMode(OpenMode.CREATE)
      .setMergeScheduler(new SerialMergeScheduler())
      .setMergePolicy(newLogMergePolicy()));
  iwriter.commit();
  DirectoryReader reader = DirectoryReader.open(dir);
  try {
    int M = 3;
    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.setTokenized(false);
    FieldType customType2 = new FieldType(TextField.TYPE_STORED);
    customType2.setTokenized(false);
    customType2.setOmitNorms(true);
    FieldType customType3 = new FieldType();
    customType3.setStored(true);
    for (int i = 0; i < 4; i++) {
      for (int j = 0; j < M; j++) {
        Document doc = new Document();
        doc.add(newField("id", i + "_" + j, customType));
        doc.add(newField("id2", i + "_" + j, customType2));
        doc.add(newField("id3", i + "_" + j, customType3));
        iwriter.addDocument(doc);
        if (i > 0) {
          int k = i - 1;
          int n = j + k * M;
          Document prevIterationDoc = reader.document(n);
          assertNotNull(prevIterationDoc);
          String id = prevIterationDoc.get("id");
          assertEquals(k + "_" + j, id);
        }
      }
      iwriter.commit();
      if (withReopen) {
        // reopen
        DirectoryReader r2 = DirectoryReader.openIfChanged(reader);
        if (r2 != null) {
          reader.close();
          reader = r2;
        }
      } else {
        // recreate
        reader.close();
        reader = DirectoryReader.open(dir);
      }
    }
  } finally {
    iwriter.close();
    reader.close();
  }
}
Example 18
Source File: DocumentsTestBase.java From lucene-solr with Apache License 2.0
protected void createIndex() throws IOException {
  indexDir = createTempDir();
  Directory dir = newFSDirectory(indexDir);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new StandardAnalyzer());

  FieldType titleType = new FieldType();
  titleType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  titleType.setStored(true);
  titleType.setTokenized(true);
  titleType.setOmitNorms(true);

  FieldType authorType = new FieldType();
  authorType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  authorType.setStored(true);
  authorType.setTokenized(true);
  authorType.setOmitNorms(false);

  FieldType textType = new FieldType();
  textType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  textType.setStored(false);
  textType.setTokenized(true);
  textType.setStoreTermVectors(true);
  textType.setOmitNorms(false);

  FieldType downloadsType = new FieldType();
  downloadsType.setDimensions(1, Integer.BYTES);
  downloadsType.setStored(true);

  Document doc1 = new Document();
  doc1.add(new Field("title", "Pride and Prejudice", titleType));
  doc1.add(new Field("author", "Jane Austen", authorType));
  doc1.add(new Field("text",
      "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",
      textType));
  doc1.add(new SortedSetDocValuesField("subject", new BytesRef("Fiction")));
  doc1.add(new SortedSetDocValuesField("subject", new BytesRef("Love stories")));
  doc1.add(new Field("downloads", packInt(28533), downloadsType));
  writer.addDocument(doc1);

  Document doc2 = new Document();
  doc2.add(new Field("title", "Alice's Adventures in Wonderland", titleType));
  doc2.add(new Field("author", "Lewis Carroll", authorType));
  doc2.add(new Field("text",
      "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, ‘and what is the use of a book,’ thought Alice ‘without pictures or conversations?’",
      textType));
  doc2.add(new SortedSetDocValuesField("subject", new BytesRef("Fantasy literature")));
  doc2.add(new Field("downloads", packInt(18712), downloadsType));
  writer.addDocument(doc2);

  Document doc3 = new Document();
  doc3.add(new Field("title", "Frankenstein; Or, The Modern Prometheus", titleType));
  doc3.add(new Field("author", "Mary Wollstonecraft Shelley", authorType));
  doc3.add(new Field("text",
      "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking.",
      textType));
  doc3.add(new SortedSetDocValuesField("subject", new BytesRef("Science fiction")));
  doc3.add(new SortedSetDocValuesField("subject", new BytesRef("Horror tales")));
  doc3.add(new SortedSetDocValuesField("subject", new BytesRef("Monsters")));
  doc3.add(new Field("downloads", packInt(14737), downloadsType));
  writer.addDocument(doc3);

  Document doc4 = new Document();
  doc4.add(new Field("title", "A Doll's House : a play", titleType));
  doc4.add(new Field("author", "Henrik Ibsen", authorType));
  doc4.add(new Field("text", "", textType));
  doc4.add(new SortedSetDocValuesField("subject", new BytesRef("Drama")));
  doc4.add(new Field("downloads", packInt(14629), downloadsType));
  writer.addDocument(doc4);

  Document doc5 = new Document();
  doc5.add(new Field("title", "The Adventures of Sherlock Holmes", titleType));
  doc5.add(new Field("author", "Arthur Conan Doyle", authorType));
  doc5.add(new Field("text",
      "To Sherlock Holmes she is always the woman. I have seldom heard him mention her under any other name. In his eyes she eclipses and predominates the whole of her sex.",
      textType));
  doc5.add(new SortedSetDocValuesField("subject", new BytesRef("Fiction")));
  doc5.add(new SortedSetDocValuesField("subject", new BytesRef("Detective and mystery stories")));
  doc5.add(new Field("downloads", packInt(12828), downloadsType));
  writer.addDocument(doc5);

  writer.commit();
  writer.close();
  dir.close();
}
Example 19
Source File: LuceneIndexFromSemrepTriples.java From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Indexes the file passed as a parameter, writing to the index passed as a parameter.
 * Each predication is indexed as an individual document, with the fields "subject", "predicate", and "object".
 * @throws IOException
 */
static void indexDoc(IndexWriter fsWriter, File triplesTextFile) throws IOException {
  BufferedReader theReader = new BufferedReader(new FileReader(triplesTextFile));
  int linecnt = 0;
  String lineIn;
  while ((lineIn = theReader.readLine()) != null) {
    java.util.StringTokenizer theTokenizer = new java.util.StringTokenizer(lineIn, "\t");
    // Output progress counter.
    if ((++linecnt % 10000 == 0) || (linecnt < 10000 && linecnt % 1000 == 0)) {
      VerbatimLogger.info((linecnt) + " ... ");
    }
    try {
      if (theTokenizer.countTokens() < 3) {
        VerbatimLogger.warning("Line in predication file does not have three delimited fields: " + lineIn + "\n");
        lineIn = theReader.readLine();
        continue;
      }
      String subject = theTokenizer.nextToken().trim().toLowerCase().replaceAll(" ", "_").replaceAll("\\|\\|\\|.*", "");
      String subject_CUI = theTokenizer.nextToken().trim().toLowerCase().replaceAll(" ", "_");
      String subject_semtype = theTokenizer.nextToken().trim().toLowerCase().replaceAll(" ", "_");
      String predicate = theTokenizer.nextToken().trim().toUpperCase().replaceAll(" ", "_");
      String object = theTokenizer.nextToken().trim().toLowerCase().replaceAll(" ", "_").replaceAll("\\|\\|\\|.*", "");
      String object_CUI = theTokenizer.nextToken().trim().toLowerCase().replaceAll(" ", "_");
      String object_semtype = theTokenizer.nextToken().trim().toLowerCase().replaceAll(" ", "_");
      String PMID = theTokenizer.nextToken();
      String source = theTokenizer.nextToken();

      Document doc = new Document();
      doc.add(new TextField("subject", subject, Field.Store.YES));
      doc.add(new TextField("subject_CUI", subject_CUI, Field.Store.YES));
      doc.add(new TextField("subject_semtype", subject_semtype, Field.Store.YES));
      doc.add(new TextField("predicate", predicate, Field.Store.YES));
      doc.add(new TextField("object", object, Field.Store.YES));
      doc.add(new TextField("object_CUI", object_CUI, Field.Store.YES));
      doc.add(new TextField("object_semtype", object_semtype, Field.Store.YES));
      doc.add(new TextField("predication", subject + predicate + object, Field.Store.NO));
      doc.add(new TextField("PMID", PMID, Field.Store.YES));

      // create a new FieldType to store term positions (TextField is not sufficiently configurable)
      FieldType ft = new FieldType();
      // the next line was commented out when the original index was built (v1.0)
      // ft.setIndexed(true);
      ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
      ft.setStored(true);
      ft.setTokenized(true);
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorPositions(true);
      Field contentsField = new Field("source", source, ft);
      doc.add(contentsField);
      fsWriter.addDocument(doc);
    } catch (Exception e) {
      System.out.println(lineIn);
      e.printStackTrace();
    }
  }
  VerbatimLogger.info("\n"); // Newline after line counter prints.
  theReader.close();
}
Example 20
Source File: Index.java From dacapobench with Apache License 2.0
/**
 * Index either a file or a directory tree.
 *
 * @param writer
 * @param file
 * @throws IOException
 */
void indexDocs(IndexWriter writer, File file) throws IOException {
  /* Strip the absolute part of the path name from file name output */
  int scratchP = scratch.getCanonicalPath().length() + 1;

  /* do not try to index files that cannot be read */
  if (file.canRead()) {
    if (file.isDirectory()) {
      String[] files = file.list(); // an IO error could occur
      if (files != null) {
        Arrays.sort(files);
        for (int i = 0; i < files.length; i++) {
          indexDocs(writer, new File(file, files[i]));
        }
      }
    } else {
      System.out.println("adding " + file.getCanonicalPath().substring(scratchP));
      try {
        Document doc = new Document();
        FieldType docFT = new FieldType();
        docFT.setTokenized(false);
        docFT.setStored(true);
        docFT.setIndexOptions(IndexOptions.DOCS);

        // Add the path of the file as a field named "path". Use a field that is
        // indexed (i.e. searchable), but don't tokenize the field into words.
        doc.add(new Field("path", file.getPath(), docFT));

        // Add the last modified date of the file as a field named "modified". Use
        // a field that is indexed (i.e. searchable), but don't tokenize the field
        // into words.
        doc.add(new Field("modified", DateTools.timeToString(file.lastModified(), DateTools.Resolution.MINUTE), docFT));

        // Add the contents of the file to a field named "contents". Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in the system's default encoding.
        // If that's not the case searching for special characters will fail.
        docFT.setTokenized(true);
        docFT.setStored(false);
        doc.add(new Field("contents", new FileReader(file), docFT));

        writer.addDocument(doc);
      } catch (FileNotFoundException fnfe) {
        // at least on windows, some temporary files raise this exception with
        // an "access denied" message; checking if the file can be read doesn't help
      }
    }
  }
}