// Java source code of Zemberek2StemFilterFactory

package org.apache.lucene.analysis.tr;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import net.zemberek.erisim.Zemberek;
import net.zemberek.islemler.KelimeKokFrekansKiyaslayici;
import net.zemberek.islemler.cozumleme.CozumlemeSeviyesi;
import net.zemberek.tr.yapi.TurkiyeTurkcesi;
import net.zemberek.yapi.Kelime;
import net.zemberek.yapi.Kok;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;

/**
 * Factory for {@link Zemberek2StemFilter}.
 */
public class Zemberek2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {

    private static final RootLengthComparator ROOT_LENGTH_COMPARATOR = new RootLengthComparator();
    private static final RootMorphemeComparator ROOT_MORPHEME_COMPARATOR = new RootMorphemeComparator();
    private static final KelimeKokFrekansKiyaslayici FREQUENCY_COMPARATOR = new KelimeKokFrekansKiyaslayici();

    private final Zemberek zemberek = new Zemberek(new TurkiyeTurkcesi());
    private final String strategy;

    public Zemberek2StemFilterFactory(Map<String, String> args) {
        super(args);
        strategy = get(args, "strategy", "maxLength");
        if (!args.isEmpty()) {
            throw new IllegalArgumentException("Unknown parameters: " + args);
        }
    }

    @Override
    public void inform(ResourceLoader loader) throws IOException {
    }


    @Override
    public TokenStream create(TokenStream input) {
        return new Zemberek2StemFilter(input);
    }

    /**
     * Stemmer based on <a href="https://code.google.com/p/zemberek">Zemberek2</a>
     */
    private final class Zemberek2StemFilter extends TokenFilter {

        private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
        private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);

        public Zemberek2StemFilter(TokenStream input) {
            super(input);
        }

        private String stem(Kelime[] cozumler, String aggregation) {

            if ("first".equals(aggregation) || cozumler.length == 1) {
                return cozumler[0].kok().icerik();
            }

            switch (aggregation) {
                case "frequency":
                    Arrays.sort(cozumler, FREQUENCY_COMPARATOR);
                    return cozumler[0].kok().icerik();
                case "maxLength":
                    Arrays.sort(cozumler, ROOT_LENGTH_COMPARATOR);
                    return cozumler[0].kok().icerik();
                case "minLength":
                    Arrays.sort(cozumler, ROOT_LENGTH_COMPARATOR);
                    return cozumler[cozumler.length - 1].kok().icerik();
                case "maxMorpheme":
                    Arrays.sort(cozumler, ROOT_MORPHEME_COMPARATOR);
                    return cozumler[0].kok().icerik();
                case "minMorpheme":
                    Arrays.sort(cozumler, ROOT_MORPHEME_COMPARATOR);
                    return cozumler[cozumler.length - 1].kok().icerik();
                default:
                    throw new RuntimeException("unknown strategy " + aggregation);
            }
        }

        @Override
        public boolean incrementToken() throws IOException {

            if (!input.incrementToken()) return false;
            if (keywordAttribute.isKeyword()) return true;

            final String term = termAttribute.toString();
            final Kelime[] cozumler = zemberek.kelimeCozumle(term, CozumlemeSeviyesi.TUM_KOKLER);
            if (cozumler.length == 0) return true;

            final String s = stem(cozumler, strategy);
            // If not stemmed, don't waste the time adjusting the token.
            if ((s != null) && !s.equals(term))
                termAttribute.setEmpty().append(s);

            return true;
        }
    }

    private static class RootLengthComparator implements Comparator<Kelime> {
        @Override
        public int compare(Kelime o1, Kelime o2) {
            if (o1 == null || o2 == null) return -1;
            final Kok k1 = o1.kok();
            final Kok k2 = o2.kok();
            return k2.icerik().length() - k1.icerik().length();
        }
    }

    private static class RootMorphemeComparator implements Comparator<Kelime> {
        @Override
        public int compare(Kelime o1, Kelime o2) {
            if (o1 == null || o2 == null) return -1;
            return o2.ekler().size() - o1.ekler().size();
        }
    }

    public static void main(String[] args) throws IOException {

        StringReader reader = new StringReader("elması utansın ortaklar çekişme ile");

        Map<String, String> map = new HashMap<>();
        map.put("strategy", "frequency");

        Zemberek2StemFilterFactory factory = new Zemberek2StemFilterFactory(map);

        WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
        whitespaceTokenizer.setReader(reader);

        TokenStream stream = factory.create(whitespaceTokenizer);

        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        stream.reset();
        while (stream.incrementToken()) {

            String term = termAttribute.toString();
            System.out.println(term);
        }
        stream.end();
        reader.close();
    }
}