// Java source code of QueryParser

/*
 * FXDesktopSearch Copyright 2013 Mirko Sertic
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.mirkosertic.desktopsearch;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Turns a user-entered search string into a Lucene {@link Query} for a single field.
 *
 * <p>The search string is split by {@code QueryTokenizer} into required and not-required
 * terms. Required terms become {@code MUST} clauses; if more than one of them survives
 * analysis, additional boosted {@code SHOULD} span clauses reward documents in which the
 * terms occur close together. Not-required terms become {@code MUST_NOT} clauses.
 *
 * <p>Terms that {@code QueryUtils} classifies as wildcard or fuzzy are used verbatim; all
 * other terms are run through the configured {@link Analyzer} first and are dropped when
 * the analyzer produces no token for them.
 */
class QueryParser {

    /** Boost for the in-order, zero-slop span match; one higher than the best proximity boost. */
    private static final int EXACT_MATCH_BOOST = 61;

    /** Base value of the proximity boost; the actual boost shrinks as the slop grows. */
    private static final int PROXIMITY_BASE_BOOST = 50;

    /** Exclusive upper bound of the slop values tried for unordered proximity matches (0..9). */
    private static final int MAX_EDIT_DISTANCE = 10;

    private final Analyzer analyzer;

    /**
     * Creates a parser that analyzes plain search terms with the given analyzer.
     *
     * @param aAnalyzer the analyzer used to tokenize non-wildcard, non-fuzzy terms
     */
    public QueryParser(final Analyzer aAnalyzer) {
        analyzer = aAnalyzer;
    }

    /**
     * Runs a single term through the analyzer and returns the first token it emits.
     *
     * @param aToken the raw term as entered by the user
     * @param aSearchField the field whose analysis chain should be applied
     * @return the first analyzed token, or an empty string if the analyzer emitted none
     * @throws IOException if the token stream fails
     */
    private String toToken(final String aToken, final String aSearchField) throws IOException {
        try (final var theStream = analyzer.tokenStream(aSearchField, aToken)) {
            // addAttribute returns the existing attribute or registers it, so it cannot fail
            // the way getAttribute does when the chain has not declared the attribute yet.
            final var theAttribute = theStream.addAttribute(CharTermAttribute.class);
            theStream.reset();

            var theFirstToken = "";
            if (theStream.incrementToken()) {
                // Copy the value now; the attribute is reused for every following token
                theFirstToken = theAttribute.toString();
                // Only the first token is used, but the stream is consumed completely so
                // that end() is called in the state the TokenStream workflow expects.
                while (theStream.incrementToken()) {
                    // intentionally empty, remaining tokens are ignored
                }
            }
            theStream.end();
            return theFirstToken;
        }
    }

    /**
     * Adds one clause per term to the given boolean query builder.
     *
     * <p>Wildcard and fuzzy terms are added verbatim as {@link WildcardQuery} or
     * {@link FuzzyQuery}. Every other term is analyzed first and skipped if nothing remains.
     *
     * @param aTermList the terms to add
     * @param aFieldName the field the clauses are created for
     * @param aQuery the builder receiving the clauses
     * @param aOccur how each clause has to occur in matching documents
     * @throws IOException if analyzing a term fails
     */
    private void addToBooleanQuery(
            final List<String> aTermList, final String aFieldName, final BooleanQuery.Builder aQuery, final BooleanClause.Occur aOccur)
            throws IOException {
        for (final var theTerm : aTermList) {
            if (QueryUtils.isWildCard(theTerm)) {
                aQuery.add(new WildcardQuery(new Term(aFieldName, theTerm)), aOccur);
            } else if (QueryUtils.isFuzzy(theTerm)) {
                aQuery.add(new FuzzyQuery(new Term(aFieldName, theTerm)), aOccur);
            } else {
                final var theTokenizedTerm = toToken(theTerm, aFieldName);
                if (!StringUtils.isEmpty(theTokenizedTerm)) {
                    aQuery.add(new TermQuery(new Term(aFieldName, theTokenizedTerm)), aOccur);
                }
            }
        }
    }

    /**
     * Builds the Lucene query for the given search string.
     *
     * <p>NOTE(review): if the search string yields no required clause, the result consists
     * of {@code MUST_NOT} clauses only (or of no clause at all). To my knowledge Lucene
     * matches no documents for such a query - confirm that this is the intended behavior.
     *
     * @param aQuery the search string as entered by the user
     * @param aSearchField the field to search in
     * @return the assembled boolean query
     * @throws IOException if analyzing a term fails
     */
    public Query parse(final String aQuery, final String aSearchField) throws IOException {

        final var theTokenizer = new QueryTokenizer(aQuery);

        // Now we have the terms, lets construct the query

        final var theResult = new BooleanQuery.Builder();

        final var theRequiredTerms = theTokenizer.getRequiredTerms();
        if (!theRequiredTerms.isEmpty()) {

            final List<SpanQuery> theSpans = new ArrayList<>();
            for (final var theTerm : theRequiredTerms) {
                if (QueryUtils.isWildCard(theTerm)) {
                    theSpans.add(new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term(aSearchField, theTerm))));
                } else if (QueryUtils.isFuzzy(theTerm)) {
                    theSpans.add(new SpanMultiTermQueryWrapper<>(new FuzzyQuery(new Term(aSearchField, theTerm))));
                } else {
                    // Ok, we need to check if the token would be removed due to stopwords and so on
                    final var theTokenizedTerm = toToken(theTerm, aSearchField);
                    if (!StringUtils.isEmpty(theTokenizedTerm)) {
                        theSpans.add(new SpanTermQuery(new Term(aSearchField, theTokenizedTerm)));
                    }
                }
            }

            // Proximity clauses only make sense with at least two terms
            if (theSpans.size() > 1) {
                // The clause array is identical for every span query below, so build it once
                final var theClauses = theSpans.toArray(new SpanQuery[0]);

                // This is the original phrase in its original order, so we boost it a lot
                final SpanQuery theExactMatchQuery = new SpanNearQuery(theClauses, 0, true);
                theResult.add(new BoostQuery(theExactMatchQuery, EXACT_MATCH_BOOST), BooleanClause.Occur.SHOULD);

                // Additionally accept the terms in any order with a slop of 0 to 9.
                // The smaller the slop, the higher the boost (60 down to 51).
                for (var theSlop = 0; theSlop < MAX_EDIT_DISTANCE; theSlop++) {
                    final SpanQuery theNearQuery = new SpanNearQuery(theClauses, theSlop, false);
                    theResult.add(
                            new BoostQuery(theNearQuery, PROXIMITY_BASE_BOOST + MAX_EDIT_DISTANCE - theSlop),
                            BooleanClause.Occur.SHOULD);
                }
            }

            // Finally, we just add simple term queries, but do not boost them
            // This makes sure that at least the searched terms
            // are found in the document
            addToBooleanQuery(theRequiredTerms, aSearchField, theResult, BooleanClause.Occur.MUST);
        }

        // Finally, add the terms that must not occur in the search result
        addToBooleanQuery(theTokenizer.getNotRequiredTerms(), aSearchField, theResult, BooleanClause.Occur.MUST_NOT);

        return theResult.build();
    }
}