org.apache.lucene.analysis.KeywordAnalyzer Java Examples

The following examples show how to use org.apache.lucene.analysis.KeywordAnalyzer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DocumentUtil.java    From netbeans with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the analyzer used for the documents: a {@link PerFieldAnalyzerWrapper}
 * constructed around a {@link KeywordAnalyzer}, with dedicated analyzers
 * registered for the three identifier fields.
 *
 * @return the newly created per-field analyzer
 */
public static Analyzer createAnalyzer() {
    final PerFieldAnalyzerWrapper perField = new PerFieldAnalyzerWrapper(new KeywordAnalyzer());
    // Each identifier field gets its own analyzer instance; the case insensitive
    // one uses the lower casing variant declared in DocumentUtil.
    perField.addAnalyzer(
            DocumentUtil.FIELD_IDENTS,
            new WhitespaceAnalyzer());
    perField.addAnalyzer(
            DocumentUtil.FIELD_FEATURE_IDENTS,
            new WhitespaceAnalyzer());
    perField.addAnalyzer(
            DocumentUtil.FIELD_CASE_INSENSITIVE_FEATURE_IDENTS,
            new DocumentUtil.LCWhitespaceAnalyzer());
    return perField;
}
 
Example #2
Source File: IndexManager.java    From netbeans with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a transactional document based index stored in the given folder.
 * <p>
 * The returned {@link DocumentIndex} is not cached: calling this method again with
 * the same arguments returns a different instance, so the caller is responsible
 * for caching the result. The underlying transactional index is created with a
 * {@link KeywordAnalyzer}.
 * @param cacheFolder the folder in which the index should be stored
 * @param cache the document caching provider
 * @return the document based index
 * @throws IOException if creating the underlying index or the document index fails
 * @since 2.19
 */
@NonNull
public static DocumentIndex.Transactional createTransactionalDocumentIndex(
        @NonNull final File cacheFolder,
        @NonNull final DocumentIndexCache cache) throws IOException {
    Parameters.notNull("cacheFolder", cacheFolder); //NOI18N
    Parameters.notNull("cache", cache); //NOI18N
    return createTransactionalDocumentIndex(createTransactionalIndex(cacheFolder, new KeywordAnalyzer()), cache);
}
 
Example #3
Source File: IndexTransactionTest.java    From netbeans with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the index used by the tests: clears the work dir, creates the
 * "cache" folder inside it and creates a {@code LuceneIndex} over that folder
 * with a {@link KeywordAnalyzer}.
 */
private void setupLuceneIndex() throws Exception {
    clearWorkDir();
    // The work dir is resolved only after it has been cleared.
    cache = new File(getWorkDir(), "cache");
    cache.mkdirs();
    index = LuceneIndex.create(cache, new KeywordAnalyzer());
}
 
Example #4
Source File: AsyncCloseTest.java    From netbeans with Apache License 2.0 5 votes vote down vote up
/**
 * Closes the index from the test thread while a worker thread is calling
 * {@code store}, and asserts that the worker finished without an exception.
 */
public void testAsyncClose() throws Exception {
    final CountDownLatch slot = new CountDownLatch(1);
    final CountDownLatch signal = new CountDownLatch(1);
    final CountDownLatch done = new CountDownLatch(1);
    final AtomicReference<Exception> failure = new AtomicReference<Exception>();

    final Index index = IndexManager.createTransactionalIndex(indexFolder, new KeywordAnalyzer());

    // NOTE(review): presumably TestInsertConvertor counts down `signal` once the store
    // is in progress and then waits on `slot` - confirm against that class.
    final Runnable storeTask = new Runnable() {
        @Override
        public void run() {
            try {
                index.store(
                        new ArrayList<String>(Arrays.asList("foo")), //NOI18N
                        Collections.<String>emptySet(),
                        new TestInsertConvertor(slot, signal),
                        new TestDeleteConvertor(),
                        true);
            } catch (Exception ex) {
                // Handed over to the test thread, which asserts on it at the end.
                failure.set(ex);
            } finally {
                done.countDown();
            }
        }
    };
    final Thread worker = new Thread(storeTask);
    worker.start();

    // Wait for the worker's signal, release it, then close the index and wait
    // for the worker to finish.
    signal.await();
    slot.countDown();
    index.close();
    done.await();

    assertNull(failure.get());
}
 
Example #5
Source File: LayeredDocumentIndex.java    From netbeans with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the overlay index, creating it on first use as a document index
 * over a memory index that uses a {@link KeywordAnalyzer}.
 *
 * @return the overlay index
 * @throws IOException if the overlay index cannot be created
 */
@NonNull
private synchronized DocumentIndex2 getOverlay() throws IOException {
    if (overlay != null) {
        return overlay;
    }
    overlay = (DocumentIndex2) IndexManager.createDocumentIndex(
            IndexManager.createMemoryIndex(new KeywordAnalyzer()));
    return overlay;
}
 
Example #6
Source File: IndexBuilder.java    From exhibitor with Apache License 2.0 5 votes vote down vote up
/**
 * Opens the index writer over {@code directory}, creating the directory first
 * when it does not exist. The writer is configured with a
 * {@link KeywordAnalyzer} and {@code OpenMode.CREATE}.
 *
 * @throws Exception if the directory cannot be created or the writer cannot be opened
 */
public void open() throws Exception
{
    // mkdirs() is attempted only when the directory is missing.
    if ( !(directory.exists() || directory.mkdirs()) )
    {
        throw new IOException("Could not make: " + directory);
    }

    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, new KeywordAnalyzer());
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    niofsDirectory = new NIOFSDirectory(directory, new SingleInstanceLockFactory());
    writer = new IndexWriter(niofsDirectory, conf);
}
 
Example #7
Source File: LuceneIndexTest.java    From netbeans with Apache License 2.0 4 votes vote down vote up
/**
 * Exercises {@code index.getStatus(true)} across several index states: empty,
 * valid after a store, invalid with an orphan lock, valid again after another
 * store, and invalid after one of the index files has been overwritten.
 */
public void testIsValid() throws Exception {
    final File wd = getWorkDir();
    final File cache = new File(wd,"cache");
    cache.mkdirs();
    final LuceneIndex index = LuceneIndex.create(cache, new KeywordAnalyzer());
    //Empty index => invalid
    assertEquals(Index.Status.EMPTY, index.getStatus(true));

    // The validity cache is cleared before every state change so that the next
    // getStatus(true) call is not answered from a previous result
    // (presumably - see clearValidityCache).
    clearValidityCache(index);
    List<String> refs = new ArrayList<String>();
    refs.add("A");
    Set<String> toDel = new HashSet<String>();
    index.store(
            refs,
            toDel,
            new StrToDocConvertor("resources"),
            new StrToQueryCovertor("resource"),
            true);
    //Existing index => valid
    assertEquals(Index.Status.VALID, index.getStatus(true));
    assertTrue(cache.listFiles().length>0);

    clearValidityCache(index);
    createLock(index);
    //Index with orphan lock => invalid
    assertEquals(Index.Status.INVALID, index.getStatus(true));
    // The cache folder is expected to be empty once the index was reported invalid.
    assertTrue(cache.listFiles().length==0);

    // Storing again is expected to bring the index back to a valid state.
    refs.add("B");
    clearValidityCache(index);
    index.store(
            refs,
            toDel,
            new StrToDocConvertor("resources"),
            new StrToQueryCovertor("resource"),
            true);
    assertEquals(Index.Status.VALID, index.getStatus(true));
    assertTrue(cache.listFiles().length>0);

    //Broken index => invalid
    clearValidityCache(index);
    File bt = null;
    for (File file : cache.listFiles()) {
        // either compound file or fields information must be present
        if (file.getName().endsWith(".cfs") || file.getName().endsWith(".fnm")) {
            bt = file;
            break;
        }
    }
    assertNotNull(bt);
    // Break the index by replacing the content of the chosen file with ten zero bytes.
    FileOutputStream out = new FileOutputStream(bt);
    try {
        out.write(new byte[] {0,0,0,0,0,0,0,0,0,0}, 0, 10);
    } finally {
        out.close();
    }
    assertEquals(Index.Status.INVALID, index.getStatus(true));
    // As with the orphan lock, the cache folder is expected to be empty afterwards.
    assertTrue(cache.listFiles().length==0);
    
}
 
Example #8
Source File: AsyncCloseTest.java    From netbeans with Apache License 2.0 4 votes vote down vote up
/**
 * Queries the index from the test thread while a worker thread is calling
 * {@code store}, and asserts that the query returns only the key "a" stored
 * up front and that the worker finished without an exception.
 */
public void testConcurrentReadWrite() throws Exception {
    final Index index = IndexManager.createTransactionalIndex(indexFolder, new KeywordAnalyzer());
    // Seed the index with one entry before the concurrent part starts.
    index.store(
            new ArrayList<String>(Arrays.asList("a")), //NOI18N
            Collections.<String>emptySet(),
            new TestInsertConvertor(),
            new TestDeleteConvertor(),
            true);

    final CountDownLatch slot = new CountDownLatch(1);
    final CountDownLatch signal = new CountDownLatch(1);
    final CountDownLatch done = new CountDownLatch(1);
    final AtomicReference<Exception> result = new AtomicReference<Exception>();

    // NOTE(review): presumably TestInsertConvertor counts down `signal` once the store
    // is in progress and then waits on `slot` - confirm against that class.
    final Runnable storeTask = new Runnable() {
        @Override
        public void run() {
            try {
                index.store(
                        new ArrayList<String>(Arrays.asList("b")), //NOI18N
                        Collections.<String>emptySet(),
                        new TestInsertConvertor(slot, signal),
                        new TestDeleteConvertor(),
                        true);
            } catch (Exception e) {
                // Handed over to the test thread, which asserts on it at the end.
                result.set(e);
            } finally {
                done.countDown();
            }
        }
    };
    final Thread worker = new Thread(storeTask);

    worker.start();
    signal.await();

    final Collection<String> found = new ArrayList<String>();
    // Maps every matching document to the value of its key field.
    final Convertor<Document,String> keyOf = new Convertor<Document,String>() {
        @Override
        public String convert(Document p) {
            return p.get(FLD_KEY);
        }
    };
    index.query(
            found,
            keyOf,
            null,
            new AtomicBoolean(),
            new PrefixQuery(new Term(FLD_KEY,""))); //NOI18N
    assertEquals(1, found.size());
    assertEquals("a", found.iterator().next());  //NOI18N

    // Release the worker only after the query results have been checked.
    slot.countDown();
    done.await();
    assertNull(result.get());
}
 
Example #9
Source File: AnchorIndexer.java    From tagme with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the anchor index for {@code lang} inside {@code workingDir}.
 * <p>
 * Iterates over the anchors of the file supplied by {@code WikipediaAnchorParser},
 * computes a frequency for each of them through {@code freq(...)} and writes one
 * document per anchor for which {@code Anchor.build} returns a non-null object.
 * The index is optimized and closed once all anchors have been processed.
 *
 * @param lang the language handed to the dataset and index helpers
 * @param workingDir the directory in which the index is written
 * @throws IOException declared for the file and index operations performed here
 */
@Override
	public void makeIndex(String lang, File workingDir) throws IOException
	{
		log.info("Loading support datasets...");
		
		File all_anchors = new WikipediaAnchorParser(lang).getFile();
		// Used only as the end value of the first progress counter below.
		long numAnchors = ExternalSortUtils.wcl(all_anchors);
		AnchorIterator iterator = new AnchorIterator(all_anchors);
		
		IntSet people = new PeopleWIDs(lang).getDataset();
		
//		IndexSearcher articles = Indexes.getSearcher(RepositoryDirs.WIKIPEDIA.getPath(lang));
		// NOTE(review): this searcher is not closed anywhere in this method - confirm who owns it.
		IndexSearcher articles = openWikipediaIndex(lang);
		//QueryParser queryParser = new QueryParser(Version.LUCENE_34, WikipediaIndexer.FIELD_BODY, new WhitespaceAnalyzer(Version.LUCENE_34));
		// StandardAnalyzer is given an empty stop word set here.
		QueryParser queryParser = new QueryParser(Version.LUCENE_34, WikipediaIndexer.FIELD_BODY, new StandardAnalyzer(Version.LUCENE_34, new HashSet<String>()));
		
		// NOTE(review): the writer is closed only on the normal path at the end of the method;
		// an exception thrown while iterating leaves it (and the iterator) open.
		IndexWriter index = new IndexWriter(FSDirectory.open(workingDir.getAbsoluteFile()), new IndexWriterConfig(Version.LUCENE_34, new KeywordAnalyzer()));
		// A single Document and its three fixed fields are reused for every anchor;
		// only their values are replaced inside the loop.
		Document doc = new Document();
		Field fId = new Field(FIELD_ID, "", Store.YES, Index.NOT_ANALYZED);
		Field fText = new Field(FIELD_TEXT, "", Store.YES, Index.NOT_ANALYZED);
		Field fObject = new Field(FIELD_OBJECT, "", Store.YES, Index.NO);
		
		doc.add(fId);
		doc.add(fText);
		doc.add(fObject);
		
//		Field fOriginal = new Field(FIELD_ORIGINAL, "", Store.YES, Index.ANALYZED);
//		Field fWID = new Field(FIELD_WID, "", Store.NO, Index.ANALYZED);
		
		// Presumably the numeric arguments of plog.update(...) index the counters in the
		// order they are named here (0 = "lines" ... 5 = "dropped") - verify against PLogger.
		PLogger plog = new PLogger(log, Step.TEN_MINUTES, "lines", "anchors", "searches", "indexed", "0-freq","dropped");
		plog.setEnd(0, numAnchors);
		plog.start("Support datasets loaded, now parsing...");
		int id=0;
		while(iterator.next())
		{
			plog.update(0, iterator.scroll);
			plog.update(1);
			String anchorText = iterator.anchor;
			
			int freq = freq(iterator.originals, articles, queryParser);
			plog.update(2, iterator.originals.size());
			if (freq == 0) plog.update(4);
			
			// NOTE(review): Anchor.build receives the id BEFORE the increment, while FIELD_ID
			// below stores the incremented value - confirm that this offset is intended.
			Anchor anchorObj = Anchor.build(id, iterator.links, freq, people);
			if (anchorObj == null){
				// No document is written and the id is not consumed for a dropped anchor.
				plog.update(5);
				continue;
			}
			
			String anchorSerial = Anchor.serialize(anchorObj);
			fId.setValue(Integer.toString(++id));
			fText.setValue(anchorText);
			fObject.setValue(anchorSerial);
			
			// One FIELD_WID field per page yielded by iterating the anchor object.
			for(int page : anchorObj){
				Field fWID = new Field(FIELD_WID, Integer.toString(page), Store.YES, Index.NOT_ANALYZED);
//				fWID.setBoost(iterator.links.get(page));
				doc.add(fWID);
			}
			// One FIELD_ORIGINAL field per original form of the anchor.
			for(String original : iterator.originals) {
				doc.add(new Field(FIELD_ORIGINAL, original, Store.YES, Index.NOT_ANALYZED));
			}
			
			index.addDocument(doc);
			plog.update(3);
			
			// Drop the per-anchor fields so that the reused document starts clean
			// for the next anchor.
			doc.removeFields(FIELD_ORIGINAL);
			doc.removeFields(FIELD_WID);
		}
		plog.stop();
		iterator.close();
		
		log.info("Now optimizing...");
		index.optimize();
		
		index.close();
		log.info("Done.");
	}
 
Example #10
Source File: TopicIndexer.java    From tagme with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the topic index for {@code lang} inside {@code workingDir}.
 * <p>
 * Scans every document of the Wikipedia index and, for each page whose type is
 * {@code PageType.TOPIC}, writes a document holding its id, title, abstract,
 * best anchor (empty string when none is known) and categories. The index is
 * optimized and closed at the end.
 *
 * @param lang the language handed to the dataset and index helpers
 * @param workingDir the directory in which the index is written
 * @throws IOException declared for the index operations performed here
 */
@Override
	public void makeIndex(String lang, File workingDir) throws IOException
	{
		
		// NOTE(review): this reader is not closed anywhere in this method - confirm who owns it.
		IndexReader articles = Indexes.getReader(RepositoryDirs.WIKIPEDIA.getPath(lang));
		Int2ObjectMap<String> bestAnchorMap = new BestAnchors(lang).getDataset();
		
		// NOTE(review): the writer is closed only on the normal path at the end of the method;
		// an exception thrown while indexing leaves it open.
		IndexWriter index = new IndexWriter(new SimpleFSDirectory(workingDir), new IndexWriterConfig(Version.LUCENE_34, new KeywordAnalyzer()));
		// A single Document and its four fixed fields are reused for every topic page;
		// only their values are replaced inside the loop.
		Document doc = new Document();
		Field fWID = new Field(FIELD_WID, "", Store.YES, Index.NOT_ANALYZED);
		Field fTitle = new Field(FIELD_TITLE, "", Store.YES, Index.NOT_ANALYZED);
		Field fAbstract = new Field(FIELD_ABSTRACT, "", Store.YES, Index.NO);
		Field fBestAnchor = new Field(FIELD_BEST_ANCHOR, "", Store.YES, Index.NO);
		doc.add(fWID);
		doc.add(fTitle);
		doc.add(fAbstract);
		doc.add(fBestAnchor);
				
		
		int max = articles.maxDoc();
		// Presumably the numeric arguments of plog.update(...) index the counters in the
		// order they are named here (0 = "pages", 1 = "indexed", 2 = "noBest") - verify against PLogger.
		PLogger plog = new PLogger(log, Step.TEN_MINUTES, "pages", "indexed", "noBest");
		plog.setEnd(max);
		plog.start("Start indexing...");
		
		for(int i=0; i<max; i++)
		{
			plog.update(0);
			Document oldDoc = articles.document(i);
			// NOTE(review): valueOf/parseInt below are not guarded against a missing
			// FIELD_TYPE or FIELD_WID value - confirm every source document carries both.
			PageType type = PageType.valueOf(oldDoc.get(WikipediaIndexer.FIELD_TYPE));
			if (type == PageType.TOPIC)
			{
				// The numeric id is needed only as the key into the best anchor map;
				// the field itself stores the original string value.
				int wid = Integer.parseInt(oldDoc.get(WikipediaIndexer.FIELD_WID));
				fWID.setValue(oldDoc.get(WikipediaIndexer.FIELD_WID));
				fAbstract.setValue(oldDoc.get(WikipediaIndexer.FIELD_ABSTRACT));
				fTitle.setValue(oldDoc.get(WikipediaIndexer.FIELD_TITLE));
				
				String bestAnchor = bestAnchorMap.get(wid);
				if (bestAnchor == null || bestAnchor.length() == 0) plog.update(2);
				// A missing best anchor is stored as an empty string.
				fBestAnchor.setValue(bestAnchor==null?"":bestAnchor);
				
				// One FIELD_CAT field per category value of the source document.
				String[] cats = oldDoc.getValues(WikipediaIndexer.FIELD_CAT);
				if (cats != null) {
					for (int j=0; j<cats.length; j++)
						doc.add(new Field(FIELD_CAT, cats[j], Store.YES, Index.NOT_ANALYZED));
				}
				
				index.addDocument(doc);
				plog.update(1);
				
				// Drop the per-page category fields so that the reused document
				// starts clean for the next page.
				doc.removeFields(FIELD_CAT);
			}
		}
		
		plog.stop();
		
		log.info("Now optimizing...");
		index.optimize();
		
		index.close();
		
		//we cannot call this because the index is still in the temporary dir
		//so TopicDocs will be created using old index
//		log.info("Index Done, now creating WID->DOC_ID map");
//		
//		TopicDocs td = new TopicDocs(lang);
//		td.forceParsing();
		
		log.info("Done.");
	}
 
Example #11
Source File: IndexManager.java    From netbeans with Apache License 2.0 3 votes vote down vote up
/**
 * Creates a document based index stored in the given folder.
 * <p>
 * The returned {@link DocumentIndex} is not cached: calling this method again with
 * the same arguments returns a different instance, so the caller is responsible
 * for caching the result. The underlying index is created with a
 * {@link KeywordAnalyzer}.
 * @param cacheFolder the folder in which the index should be stored
 * @param cache the document caching provider
 * @return the document based index
 * @throws IOException if creating the underlying index or the document index fails
 * @since 2.18.0
 */
public static DocumentIndex createDocumentIndex(
        @NonNull final File cacheFolder,
        @NonNull final DocumentIndexCache cache) throws IOException {
    Parameters.notNull("cacheFolder", cacheFolder); //NOI18N
    Parameters.notNull("cache", cache); //NOI18N
    return createDocumentIndex(
            createIndex(cacheFolder, new KeywordAnalyzer()),
            cache);
}
 
Example #12
Source File: IndexManager.java    From netbeans with Apache License 2.0 2 votes vote down vote up
/**
 * Creates a document based index stored in the given folder, optionally read only.
 * <p>
 * The returned {@link DocumentIndex} is not cached: calling this method again with
 * the same arguments returns a different instance, so the caller is responsible
 * for caching the result. The underlying index is created with a
 * {@link KeywordAnalyzer}.
 * @param cacheFolder the folder in which the index should be stored
 * @param isWritable <code>false</code> if it is read only index
 * @return the document based index
 * @throws IOException if creating the underlying index or the document index fails
 * @since 2.27.1
 */
public static DocumentIndex createDocumentIndex(
        @NonNull final File cacheFolder,
        boolean isWritable) throws IOException {
    Parameters.notNull("cacheFolder", cacheFolder); //NOI18N
    return createDocumentIndex(
            createIndex(cacheFolder, new KeywordAnalyzer(), isWritable));
}
 
Example #13
Source File: IndexManager.java    From netbeans with Apache License 2.0 2 votes vote down vote up
/**
 * Creates a document based index stored in the given folder.
 * <p>
 * The returned {@link DocumentIndex} is not cached: calling this method again with
 * the same argument returns a different instance, so the caller is responsible
 * for caching the result. The underlying index is created with a
 * {@link KeywordAnalyzer}.
 * @param cacheFolder the folder in which the index should be stored
 * @return the document based index
 * @throws IOException if creating the underlying index or the document index fails
 * @since 1.1
 */
public static DocumentIndex createDocumentIndex(
        @NonNull final File cacheFolder) throws IOException {
    Parameters.notNull("cacheFolder", cacheFolder); //NOI18N
    return createDocumentIndex(
            createIndex(cacheFolder, new KeywordAnalyzer()));
}