org.apache.lucene.index.MultiFields Java Examples

The following examples show how to use org.apache.lucene.index.MultiFields. Each example notes the project it is taken from and that project's license; see the original project or source file for full context.
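Before the project-specific examples, here is a minimal, self-contained sketch of the class's two most common entry points: a merged Terms view of one field and the merged live-docs bitset. The index path and field name are placeholders, and the sketch assumes a Lucene 5.x–7.x API (in Lucene 8 these helpers were folded into MultiTerms and MultiBits):

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

public class MultiFieldsSketch {
    public static void main(String[] args) throws IOException {
        // Placeholder path and field name -- substitute your own index.
        try (FSDirectory dir = FSDirectory.open(Paths.get("/path/to/index"));
             IndexReader reader = DirectoryReader.open(dir)) {

            // Index-wide view of one field's terms, merged across all segments
            // (null if no document has the field).
            Terms terms = MultiFields.getTerms(reader, "contents");
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator();
                BytesRef term;
                while ((term = termsEnum.next()) != null) {
                    System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
                }
            }

            // Merged deletions bitset; null means the index has no deletions.
            Bits liveDocs = MultiFields.getLiveDocs(reader);
            System.out.println("has deletions: " + (liveDocs != null));
        }
    }
}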
Example #1
Source File: TransportFieldStatsTransportAction.java    From Elasticsearch with Apache License 2.0
@Override
protected FieldStatsShardResponse shardOperation(FieldStatsShardRequest request) {
    ShardId shardId = request.shardId();
    Map<String, FieldStats> fieldStats = new HashMap<>();
    IndexService indexServices = indicesService.indexServiceSafe(shardId.getIndex());
    MapperService mapperService = indexServices.mapperService();
    IndexShard shard = indexServices.shardSafe(shardId.id());
    try (Engine.Searcher searcher = shard.acquireSearcher("fieldstats")) {
        for (String field : request.getFields()) {
            MappedFieldType fieldType = mapperService.fullName(field);
            if (fieldType != null) {
                IndexReader reader = searcher.reader();
                Terms terms = MultiFields.getTerms(reader, field);
                if (terms != null) {
                    fieldStats.put(field, fieldType.stats(terms, reader.maxDoc()));
                }
            } else {
                throw new IllegalArgumentException("field [" + field + "] doesn't exist");
            }
        }
    } catch (IOException e) {
        throw ExceptionsHelper.convertToElastic(e);
    }
    return new FieldStatsShardResponse(shardId, fieldStats);
}
 
Example #2
Source File: FieldsConsumer.java    From lucene-solr with Apache License 2.0
/** Merges in the fields from the readers in 
 *  <code>mergeState</code>. The default implementation skips
 *  and maps around deleted documents, and calls {@link #write(Fields,NormsProducer)}.
 *  Implementations can override this method for more sophisticated
 *  merging (bulk-byte copying, etc). */
public void merge(MergeState mergeState, NormsProducer norms) throws IOException {
  final List<Fields> fields = new ArrayList<>();
  final List<ReaderSlice> slices = new ArrayList<>();

  int docBase = 0;

  for(int readerIndex=0;readerIndex<mergeState.fieldsProducers.length;readerIndex++) {
    final FieldsProducer f = mergeState.fieldsProducers[readerIndex];

    final int maxDoc = mergeState.maxDocs[readerIndex];
    f.checkIntegrity();
    slices.add(new ReaderSlice(docBase, maxDoc, readerIndex));
    fields.add(f);
    docBase += maxDoc;
  }

  Fields mergedFields = new MappedMultiFields(mergeState, 
                                              new MultiFields(fields.toArray(Fields.EMPTY_ARRAY),
                                                              slices.toArray(ReaderSlice.EMPTY_ARRAY)));
  write(mergedFields, norms);
}
 
Example #3
Source File: MtasFieldsConsumer.java    From mtas with Apache License 2.0
@Override
public void merge(MergeState mergeState) throws IOException {
  final List<Fields> fields = new ArrayList<>();
  final List<ReaderSlice> slices = new ArrayList<>();

  int docBase = 0;

  for (int readerIndex = 0; readerIndex < mergeState.fieldsProducers.length; readerIndex++) {
    final FieldsProducer f = mergeState.fieldsProducers[readerIndex];

    final int maxDoc = mergeState.maxDocs[readerIndex];
    f.checkIntegrity();
    slices.add(new ReaderSlice(docBase, maxDoc, readerIndex));
    fields.add(f);
    docBase += maxDoc;
  }

  Fields mergedFields = new MappedMultiFields(mergeState,
      new MultiFields(fields.toArray(Fields.EMPTY_ARRAY),
          slices.toArray(ReaderSlice.EMPTY_ARRAY)));
  write(mergedFields);
}
 
Example #4
Source File: LuceneUtils.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * @param flagConfig Contains all information necessary for configuring LuceneUtils.
 *        {@link FlagConfig#luceneindexpath()} must be non-empty. 
 */
public LuceneUtils(FlagConfig flagConfig) throws IOException {
  if (flagConfig.luceneindexpath().isEmpty()) {
    throw new IllegalArgumentException(
        "-luceneindexpath is a required argument for initializing LuceneUtils instance.");
  }

  this.compositeReader = DirectoryReader.open(
      FSDirectory.open(FileSystems.getDefault().getPath(flagConfig.luceneindexpath())));
  this.leafReader = SlowCompositeReaderWrapper.wrap(compositeReader);
  MultiFields.getFields(compositeReader);
  this.flagConfig = flagConfig;
  if (!flagConfig.stoplistfile().isEmpty())
    loadStopWords(flagConfig.stoplistfile());

  if (!flagConfig.startlistfile().isEmpty())
    loadStartWords(flagConfig.startlistfile());

  VerbatimLogger.info("Initialized LuceneUtils from Lucene index in directory: " + flagConfig.luceneindexpath() + "\n");
  VerbatimLogger.info("Fields in index are: " + String.join(", ", this.getFieldNames()) + "\n");
}
 
Example #5
Source File: TermDocIterable.java    From incubator-retired-blur with Apache License 2.0
private boolean getNext() {
  try {
    int next = docsEnum.nextDoc();
    if (next == DocIdSetIterator.NO_MORE_DOCS) {
      return false;
    }
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    if (liveDocs != null) {
      // Skip deleted documents, stopping if the enum is exhausted:
      // liveDocs.get() must not be called with NO_MORE_DOCS as an index.
      while (next != DocIdSetIterator.NO_MORE_DOCS && !liveDocs.get(docsEnum.docID())) {
        next = docsEnum.nextDoc();
      }
    }
    return next != DocIdSetIterator.NO_MORE_DOCS;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
 
Example #6
Source File: LuceneIndexCorpus.java    From word2vec-lucene with Apache License 2.0
@Override
public void learnVocab() throws IOException {
  super.learnVocab();

  final String field = ((LuceneIndexConfig)config).getField();
  final Terms terms = MultiFields.getTerms(reader, field);
  final BytesRef maxTerm = terms.getMax();
  final BytesRef minTerm = terms.getMin();
  Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
  IndexSearcher searcher = new IndexSearcher(reader);
  topDocs = searcher.search(q, Integer.MAX_VALUE);

  TermsEnum termsEnum = terms.iterator(null); // no TermsEnum to reuse on the first call

  termsEnum.seekCeil(new BytesRef());
  BytesRef term = termsEnum.term();
  while(term != null){
    int p = addWordToVocab(term.utf8ToString());
    vocab[p].setCn((int)termsEnum.totalTermFreq());
    term = termsEnum.next();
  }
}
 
Example #7
Source File: CustomSpellCheckListner.java    From customized-symspell with MIT License
/**
 * Reload method of the spellcheck listener.
 *
 * @param newSearcher the searcher opened on the refreshed index
 * @param checker the spell checker whose dictionary is repopulated
 * @throws IOException
 * @throws SpellCheckException
 */
public void reload(SolrIndexSearcher newSearcher, SpellChecker checker)
    throws IOException, SpellCheckException {

  DirectoryReader productsIndexReader = newSearcher.getIndexReader();
  Fields fields = MultiFields.getFields(productsIndexReader);
  IndexSchema schema = newSearcher.getCore().getLatestSchema();
  long time = System.currentTimeMillis();
  for (String field : fields) {
    if (!fieldArr.contains(field)) {
      continue;
    }
    FieldType type = schema.getField(field).getType();
    int insertionsCount = 0;
    for (TermsEnum iterator = fields.terms(field).iterator(); iterator.next() != null; ) {
      BytesRef term = iterator.term();
      CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
      type.indexedToReadable(term, charsRefBuilder);
      insertionsCount++;
      checker.getDataHolder().addItem(
          new DictionaryItem(charsRefBuilder.toString().trim(), (double) iterator.totalTermFreq(),
              0.0));
    }
    log.info("Spellcheck Dictionary populated for Field Name {}, Count {}", field,
        insertionsCount);
  }
  log.info("Data for SpellChecker  was populated. Time={} ms",
      (System.currentTimeMillis() - time));
}
 
Example #8
Source File: SORecommender.java    From scava with Eclipse Public License 2.0
private List<String> getAllIndexTags(String INDEX_DIRECTORY) {
	Collection<String> result = new HashSet<String>();
	try {
		IndexReader luceneIndexReader = DirectoryReader.open(FSDirectory.open(Paths.get(INDEX_DIRECTORY)));
		result = MultiFields.getIndexedFields(luceneIndexReader);
	} catch (IOException e) {
		logger.error(e.getMessage());
	}

	List<String> sortedList = new ArrayList<String>(result);
	Collections.sort(sortedList);

	return sortedList;
}
 
Example #9
Source File: TermFreq.java    From SourcererCC with GNU General Public License v3.0
private void dummy() throws IOException {
    Fields fields = MultiFields.getFields(this.reader);
    // The field enumerated here must match the field used for the Term below.
    Terms terms = fields.terms("tokens");
    TermsEnum iterator = terms.iterator(null);
    BytesRef byteRef = null;
    while ((byteRef = iterator.next()) != null) {
        // Decode the term bytes as UTF-8 rather than the platform charset.
        String term = byteRef.utf8ToString();
        Term termInstance = new Term("tokens", term);
        long termFreq = this.reader.totalTermFreq(termInstance);
        this.TermFreqMap.put(term, termFreq);
        System.out.println(termFreq);
    }
}
 
Example #10
Source File: WordScorer.java    From Elasticsearch with Apache License 2.0
public WordScorer(IndexReader reader, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
    this(reader, MultiFields.getTerms(reader, field), field, realWordLikelyHood, separator);
}
 
Example #11
Source File: AlfrescoLukeRequestHandler.java    From SearchServices with GNU Lesser General Public License v3.0
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req,
		String field, SimpleOrderedMap<Object> fieldMap) throws IOException {

	SolrParams params = req.getParams();
	final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

	TopTermQueue tiq = new TopTermQueue(numTerms + 1); // something to collect the top N terms in

	final CharsRefBuilder spare = new CharsRefBuilder();

	Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(),
			field);
	if (terms == null) { // field does not exist
		return;
	}
	TermsEnum termsEnum = terms.iterator();
	BytesRef text;
	int[] buckets = new int[HIST_ARRAY_SIZE];
	while ((text = termsEnum.next()) != null) {
		++tiq.distinctTerms;
		int freq = termsEnum.docFreq(); // this calculation seems odd, but it gives the same results as it used to
		int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
		buckets[slot] = buckets[slot] + 1;
		if (numTerms > 0 && freq > tiq.minFreq) {
			spare.copyUTF8Bytes(text);
			String t = spare.toString();

			tiq.add(new TopTermQueue.TermInfo(new Term(field, t), freq));
			if (tiq.size() > numTerms) { // if tiq full
				tiq.pop(); // remove lowest in tiq
				tiq.minFreq = tiq.getTopTermInfo().docFreq;
			}
		}
	}
	tiq.histogram.add(buckets);
	fieldMap.add("distinct", tiq.distinctTerms);

	// Include top terms
	fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

	// Add a histogram
	fieldMap.add("histogram", tiq.histogram.toNamedList());
}
 
Example #12
Source File: IndexLoader.java    From solr-autocomplete with Apache License 2.0
public static void main(String[] args) throws CorruptIndexException, IOException, SolrServerException {

        if (args.length < 3) {
            System.err.println("Usage: java -Dfile.encoding=UTF8 -Dclient.encoding.override=UTF-8 -Xmx256m -Xms256m -server " + IndexLoader.class.getName()
                    + " </path/to/index> <AutoCompleteSolrUrl> <indexField1,acField1> [indexField2,acField2 ... ]");
            System.exit(0);
        }
        Map<String,String> fieldMap = getFieldMapping(args, 2);
        DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(args[0])));
        int docs = reader.maxDoc();
        SolrClient solr = new ConcurrentUpdateSolrClient.Builder(args[1]).withQueueSize(10000).withThreadCount(2).build();
        Set<SolrInputDocument> batch = new HashSet<SolrInputDocument>(1000);
        
        Bits liveDocs = MultiFields.getLiveDocs(reader);
        
        // go through all docs in the index
        for (int i = 0; i < docs; i++) {
            // process doc only if not deleted
            if (liveDocs == null || liveDocs.get(i)) {
                // loop through all fields to be looked at
                SolrInputDocument doc = new SolrInputDocument();
                Iterator<String> iter = fieldMap.keySet().iterator();
                
                boolean phraseFieldEmpty = false;
                
                while (iter.hasNext()) {
                    String indexField = iter.next();
                    String acField = fieldMap.get(indexField);
                    IndexableField field = reader.document(i).getField(indexField);
                    String value = field != null ? field.stringValue() : null;
                    
                    if (field != null && value != null && !value.isEmpty()) {
                      doc.addField(acField, value);
                    } else {
                      // not very relevant piece of info
                      // System.err.println("Field is null or empty, skipping: " + indexField);
                      
                      if (acField.equalsIgnoreCase("phrase")) {
                        System.err.println("Since AC phrase field would be null, this doc will not be created: " + reader.document(i));
                        phraseFieldEmpty = true;
                        break;
                      }
                    }
                }

                if (!phraseFieldEmpty) {
                  batch.add(doc);
                  // flush the accumulated batch periodically and report progress
                  if (batch.size() >= 1000) {
                    solr.add(batch);
                    batch.clear();
                    System.out.println("Docs: " + (i + 1));
                  }
                }
            }
        }
        if (!batch.isEmpty())
            solr.add(batch);
        reader.close();
        System.out.println("Optimizing...");
        solr.optimize();
        solr.close();
    }
 
Example #13
Source File: TermFreqAnalyser.java    From Siamese with GNU General Public License v3.0
private static void analyseTermFreq(String indexName, String field, String freqType, String outputFileName) {
        String indexFile = elasticsearchLoc + "/data/stackoverflow/nodes/0/indices/"
                + indexName + "/0/index";
        DecimalFormat df = new DecimalFormat("#.00");
        int printEvery = 100000;
        File outputFile = new File(outputFileName);
        if (outputFile.exists()) {
            if (!outputFile.delete()) {
                System.out.println("ERROR: cannot delete the output file.");
                System.exit(0);
            }
        }
        /* adapted from
        https://stackoverflow.com/questions/28244961/lucene-4-10-2-calculate-tf-idf-for-all-terms-in-index
         */
        int count = 0;
        try {
            IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexFile)));
            Fields fields = MultiFields.getFields(reader);
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            int size = 0;
            // TODO: is there a better solution?
            // iterate to get the size
            while (termsEnum.next() != null) {
                size++;
            }
//            String[] termArr = new String[size];
            long[] freqArr = new long[size];
            // do the real work
            termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
//                String term = termsEnum.term().utf8ToString();
                long tfreq = 0;
                if (freqType.equals("tf"))
                    tfreq = termsEnum.totalTermFreq();
                else if (freqType.equals("df"))
                    tfreq = termsEnum.docFreq();
                else {
                    System.out.println("Wrong frequency. Quit!");
                    System.exit(0);
                }
//                termArr[count] = term;
                freqArr[count] = tfreq;
                if (count % printEvery == 0) {
                    System.out.println("processed: " + count + " terms "
                            + " [" + df.format(((long)count * 100)/size) + "%]");
                }
                count++;
            }
            System.out.println(field + ": total = " + count);
            double[] data = new double[size];
            StringBuilder output = new StringBuilder("freq\n");
            for (int i = 0; i < freqArr.length; i++) {
                data[i] = freqArr[i];
                output.append(freqArr[i]).append("\n");
                if (i > 0 && i % printEvery == 0) {
                    MyUtils.writeToFile("./", outputFileName, output.toString(), true);
                    System.out.println("written: " + i + " terms "
                            + " [" + df.format(((long)i * 100)/size) + "%]");
                    output.setLength(0);
                }
            }
            // write the rest to the file
            MyUtils.writeToFile("./", outputFileName, output.toString(), true);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
 
Example #14
Source File: IndexRequestMasterListenerIT.java    From development with Apache License 2.0
private void assertDocsInIndex(final Class<?> clazz, final String comment,
        final int expectedNumDocs, final int expectedNumIndexedAttributes,
        final List<String> expectedAttributes) throws Exception {
    Boolean evaluationTookPlace = runTX(new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            boolean evaluatedIndex = false;
            Session session = dm.getSession();
            if (session != null) {
                FullTextSession fullTextSession = Search
                        .getFullTextSession(session);
                SearchFactory searchFactory = fullTextSession
                        .getSearchFactory();
                IndexReader reader = searchFactory.getIndexReaderAccessor()
                        .open(clazz);

                try {
                    assertEquals(comment, expectedNumDocs,
                            reader.numDocs());
                    if (expectedNumDocs > 0) {
                        final FieldInfos indexedFieldNames = MultiFields
                                .getMergedFieldInfos(reader);
                        for (String expectedAttr : expectedAttributes) {
                            assertNotNull(
                                    "attribute " + expectedAttr
                                            + " does not exist in index: "
                                            + indexedFieldNames,
                                    indexedFieldNames
                                            .fieldInfo(expectedAttr));
                        }
                        assertNotNull(
                                "attribute \"key\" does not exist in index: "
                                        + indexedFieldNames,
                                indexedFieldNames.fieldInfo("key"));
                        assertNotNull(
                                "attribute \"_hibernate_class\" does not exist in index: "
                                        + indexedFieldNames,
                                indexedFieldNames
                                        .fieldInfo("_hibernate_class"));
                        assertEquals(
                                "More or less attributes indexed than expected, attributes retrieved from index: "
                                        + indexedFieldNames,
                                expectedNumIndexedAttributes + 2,
                                indexedFieldNames.size());
                        evaluatedIndex = true;
                    }
                } finally {
                    searchFactory.getIndexReaderAccessor().close(reader);
                }
            }

            return Boolean.valueOf(evaluatedIndex);
        }
    });

    if (expectedNumDocs > 0) {
        Assert.assertTrue("Index not found, no evaluation took place",
                evaluationTookPlace.booleanValue());
    }
}
 
Example #15
Source File: TermSearcher.java    From SourcererCC with GNU General Public License v3.0
public synchronized void searchWithPosition(int queryTermsSeen) {
    if (null != this.reader) {
        if (null != this.reader.getContext()) {
            if (null != this.reader.getContext().leaves()) {
                Term term = new Term("tokens", this.searchTerm);
                for (AtomicReaderContext ctx : this.reader.getContext()
                        .leaves()) {
                    int base = ctx.docBase;
                    // SpanTermQuery spanQ = new SpanTermQuery(term);
                    try {
                        DocsAndPositionsEnum docEnum = MultiFields
                                .getTermPositionsEnum(ctx.reader(),
                                        MultiFields.getLiveDocs(ctx
                                                .reader()), "tokens", term
                                                .bytes());
                        if (null != docEnum) {
                            int doc = DocsEnum.NO_MORE_DOCS;
                            while ((doc = docEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
                                long docId = doc + base;
                                CandidateSimInfo simInfo = null;
                                if (this.simMap.containsKey(docId)) {
                                    simInfo = this.simMap.get(docId);
                                    simInfo.similarity = simInfo.similarity
                                            + Math.min(freqTerm,
                                                    docEnum.freq());

                                } else {
                                    if (earlierDocs.contains(docId))
                                        continue;

                                    Document d = SearchManager.searcher
                                            .get(shard).getDocument(docId);
                                    long candidateId = Long.parseLong(d
                                            .get("id"));
                                    // Get rid of these early -- we're only
                                    // looking for candidates
                                    // whose ids are smaller than the query
                                    if (candidateId >= this.queryId) {
                                        // System.out.println("Query " +
                                        // this.queryId +
                                        // ", getting rid of " +
                                        // candidateId);
                                        earlierDocs.add(docId);
                                        continue; // we reject the candidate
                                    }

                                    simInfo = new CandidateSimInfo();
                                    simInfo.doc = d;
                                    simInfo.candidateSize = Integer
                                            .parseInt(d.get("size"));
                                    simInfo.similarity = Math.min(freqTerm,
                                            docEnum.freq());
                                    // System.out.println("before putting in simmap "+
                                    // Util.debug_thread());
                                    this.simMap.put(docId, simInfo);
                                    // System.out.println("after putting in simmap "+
                                    // Util.debug_thread());
                                }
                                simInfo.queryMatchPosition = queryTermsSeen;
                                int candidatePos = docEnum.nextPosition();
                                simInfo.candidateMatchPosition = candidatePos
                                        + docEnum.freq();
                                if (!Util.isSatisfyPosFilter(
                                        this.simMap.get(docId).similarity,
                                        this.querySize, queryTermsSeen,
                                        simInfo.candidateSize,
                                        simInfo.candidateMatchPosition,
                                        this.computedThreshold)) {
                                    // System.out.println("before removing in simmap "+
                                    // Util.debug_thread());
                                    this.simMap.remove(docId);
                                    // System.out.println("after removing in simmap "+
                                    // Util.debug_thread());
                                }
                            }
                        } else {
                            logger.trace("docEnum is null, " + base
                                    + ", term: " + this.searchTerm
                                    + Util.debug_thread());
                        }
                    } catch (Exception e) {
                        e.printStackTrace();
                        logger.error("exception caught " + e.getMessage()
                                + Util.debug_thread() + " search term:"
                                + this.searchTerm);
                    }
                }
            } else {
                logger.debug("leaves are null, " + this.searchTerm
                        + Util.debug_thread());
            }
        } else {
            logger.debug("getContext is null, " + this.searchTerm
                    + Util.debug_thread());
        }
    } else {
        logger.debug("this.reader is null, " + this.searchTerm
                + Util.debug_thread());
    }
}
 
Example #16
Source File: LuceneIndexTest.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
/**
 * NB: this is a convenient but very slow way of getting termDocs. It is sufficient for testing purposes.
 *
 * @throws IOException
 */
private static PostingsEnum termDocs(IndexReader reader, Term term) throws IOException {
	return MultiFields.getTermDocsEnum(reader, term.field(), term.bytes());
}
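For completeness, this is how a test might drain the enum that termDocs returns — a minimal sketch assuming the reader and term variables from the surrounding test class and the Lucene 5+ PostingsEnum API:

PostingsEnum postings = termDocs(reader, term);
int doc;
// getTermDocsEnum returns null when the term does not occur in the field
while (postings != null && (doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
	System.out.println("doc=" + doc + " freq=" + postings.freq());
}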
}