// Java source code of URLTokenFilter

package org.elasticsearch.index.analysis.url;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.Strings;
import org.elasticsearch.index.analysis.URLPart;

import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Joe Linn
 * 1/17/2015
 */
public final class URLTokenFilter extends TokenFilter {
    public static final String NAME = "url";

    private List<URLPart> parts;

    private boolean urlDeocde;

    /**
     * If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
     */
    private boolean tokenizeHost = true;

    /**
     * If true, the url's path will be tokenized using a {@link PathHierarchyTokenizer}
     */
    private boolean tokenizePath = true;

    /**
     * If true, the url's query string will be split on &
     */
    private boolean tokenizeQuery = true;

    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);

    private final boolean allowMalformed;

    private boolean tokenizeMalformed;

    private boolean passthrough;

    private List<Token> tokens;
    private Iterator<Token> iterator;

    public URLTokenFilter(TokenStream input, URLPart part) {
        this(input, part, false);
    }

    public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode) {
        this(input, part, urlDecode, false);
    }

    public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed) {
        this(input, part, urlDecode, allowMalformed, false);
    }

    public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed, boolean passthrough) {
        super(input);
        if (part != null) {
            this.parts = Collections.singletonList(part);
        } else {
            parts = null;
        }
        this.urlDeocde = urlDecode;
        this.allowMalformed = allowMalformed;
        this.passthrough = passthrough;
    }


    public URLTokenFilter setParts(List<URLPart> parts) {
        this.parts = parts;
        return this;
    }

    public URLTokenFilter setTokenizeHost(boolean tokenizeHost) {
        this.tokenizeHost = tokenizeHost;
        return this;
    }

    public URLTokenFilter setTokenizePath(boolean tokenizePath) {
        this.tokenizePath = tokenizePath;
        return this;
    }

    public URLTokenFilter setTokenizeQuery(boolean tokenizeQuery) {
        this.tokenizeQuery = tokenizeQuery;
        return this;
    }


    public URLTokenFilter setTokenizeMalformed(boolean tokenizeMalformed) {
        this.tokenizeMalformed = tokenizeMalformed;
        return this;
    }

    public URLTokenFilter setUrlDeocde(boolean urlDeocde) {
        this.urlDeocde = urlDeocde;
        return this;
    }


    @Override
    public boolean incrementToken() throws IOException {
        if (iterator == null || !iterator.hasNext()) {
            if ((iterator != null && !iterator.hasNext() && !passthrough) || !advance()) {
                return false;
            }
        }
        clearAttributes();
        Token next = iterator.next();
        termAttribute.append(next.getToken());
        typeAttribute.setType(next.getPart().name().toLowerCase());
        offsetAttribute.setOffset(next.getStart(), next.getEnd());
        return true;
    }


    /**
     * Advance to the next token, if any
     * @return true if more tokens are forthcoming, false otherwise
     * @throws IOException
     */
    private boolean advance() throws IOException {
        if (input.incrementToken()) {
            String urlString = termAttribute.toString();
            if ((Strings.isNullOrEmpty(urlString) || "null".equals(urlString)) && !allowMalformed && !passthrough) {
                return false;
            }
            try {
                tokens = tokenize(urlString);
            } catch (IOException e) {
                if (e.getMessage().contains("Malformed URL")) {
                    if (allowMalformed) {
                        tokens = Collections.singletonList(new Token(urlString, URLPart.WHOLE, 0, urlString.length()));
                    } else {
                        throw new MalformedURLException("Malformed URL: " + urlString);
                    }
                }
                throw e;
            }
            if (tokens.isEmpty()) {
                return false;
            }
            iterator = tokens.iterator();
            return true;
        } else {
            return false;
        }
    }


    /**
     * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
     * will be passed along to the tokenizer.
     * @param input a string to be tokenized
     * @return a list of tokens extracted from the input string
     * @throws IOException
     */
    private List<Token> tokenize(String input) throws IOException {
        List<Token> tokens = new ArrayList<>();
        URLTokenizer tokenizer = new URLTokenizer();
        // create a copy of the parts list to avoid ConcurrentModificationException when sorting
        tokenizer.setParts(new ArrayList<>(parts));
        tokenizer.setUrlDecode(urlDeocde);
        tokenizer.setTokenizeHost(tokenizeHost);
        tokenizer.setTokenizePath(tokenizePath);
        tokenizer.setTokenizeQuery(tokenizeQuery);
        tokenizer.setAllowMalformed(allowMalformed || passthrough);
        tokenizer.setTokenizeMalformed(tokenizeMalformed);
        tokenizer.setReader(new StringReader(input));
        tokenizer.reset();

        String term;
        URLPart part;
        OffsetAttribute offset;
        while (tokenizer.incrementToken()) {
            term = tokenizer.getAttribute(CharTermAttribute.class).toString();
            part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
            offset = tokenizer.getAttribute(OffsetAttribute.class);
            tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
        }
        return tokens;
    }


    @Override
    public void reset() throws IOException {
        super.reset();
        tokens = null;
        iterator = null;
    }

    private static final Pattern REGEX_PROTOCOL = Pattern.compile("^([a-zA-Z]+)(?=://)");
    private static final Pattern REGEX_PORT = Pattern.compile(":([0-9]{1,5})");
    private static final Pattern REGEX_QUERY = Pattern.compile("\\?(.+)");

    /**
     * Attempt to parse a malformed url string
     * @param urlString the malformed url string
     * @return the url part if it can be parsed, null otherwise
     * @deprecated parsing of malformed URLs is now delegated to {@link URLTokenizer}
     */
    private String parseMalformed(String urlString) {
        if (parts != null && !parts.isEmpty()) {
            String ret;
            for (URLPart part : parts) {
                switch (part) {
                    case PROTOCOL:
                        ret = applyPattern(REGEX_PROTOCOL, urlString);
                        break;
                    case PORT:
                        ret = applyPattern(REGEX_PORT, urlString);
                        break;
                    case QUERY:
                        ret = applyPattern(REGEX_QUERY, urlString);
                        break;
                    case WHOLE:
                        ret = urlString;
                        break;
                    default:
                        ret = urlString;
                }
                if (!Strings.isNullOrEmpty(ret)) {
                    return ret;
                }
            }
        }
        return urlString;
    }

    /**
     * Apply the given regex pattern to the given malformed url string and return the first match
     * @param pattern the pattern to match
     * @param urlString the malformed url to which the pattern should be applied
     * @return the first match if one exists, null otherwise
     */
    private String applyPattern(Pattern pattern, String urlString) {
        Matcher matcher = pattern.matcher(urlString);
        if (matcher.find()) {
            return matcher.group(1);
        }
        return null;
    }
}