package org.airsonic.player.service.search;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
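// The imports below are used only by the illustrative sketch methods added
// in this test (package locations assume Lucene 8.x).
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;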
import org.junit.Test;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
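import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;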

import static java.util.Arrays.asList;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

/**
 * Test case for the Analyzer.
 * These tests document the current analysis behavior
 * and make the impact of upgrading Lucene observable.
 */
public class AnalyzerFactoryTestCase {

    private AnalyzerFactory analyzerFactory = new AnalyzerFactory();

    /**
     * Tests the number of tokens produced for each field.
     */
    @Test
    public void testTokenCounts() {

        /*
         * The legacy Analyzer uses the same Tokenizer for all fields.
         * (Some fields convert their input string beforehand for integrity.)
         * As a result, the string-handling rules are scattered and difficult to follow.
         * With PerFieldAnalyzerWrapper it is possible to use a different
         * Analyzer (Tokenizer/Filter chain) for each field,
         * which allows the parsing definitions to be managed consistently.
         * It also makes it possible to apply field-specific definitions,
         * such as an "id3 delimiters Tokenizer", to particular fields.
         * (A minimal sketch of this idea follows after this test method.)
         */

        // The number of words excluding articles is 7.
        String query = "The quick brown fox jumps over the lazy dog.";

        Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> {
            List<String> terms = toTermString(n, query);
            switch (n) {

                /*
                 * In the legacy implementation these fields split the input into 7 tokens,
                 * although these fields do not actually need to be tokenized.
                 */
                case FieldNames.FOLDER:
                case FieldNames.MEDIA_TYPE:
                case FieldNames.GENRE:
                    assertEquals("oneTokenFields : " + n, 7, terms.size());
                    break;

                /*
                 * These should be divided into 7.
                 */
                case FieldNames.ARTIST:
                case FieldNames.ALBUM:
                case FieldNames.TITLE:
                    assertEquals("multiTokenFields : " + n, 7, terms.size());
                    break;
                /*
                 * ID, FOLDER_ID, YEAR
                 * This is not a problem because the input value does not contain a delimiter.
                 */
                default:
                    assertEquals("oneTokenFields : " + n, 7, terms.size());
                    break;
            }
        });

    }
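
    /*
     * Illustration only: a minimal sketch of the PerFieldAnalyzerWrapper idea
     * described in the test above. The analyzer choices are hypothetical
     * examples, not the wiring AnalyzerFactory actually uses.
     */
    @SuppressWarnings("unused")
    private Analyzer createPerFieldAnalyzerSketch() {
        Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
        // e.g. keep GENRE as a single token instead of splitting it into words
        fieldAnalyzers.put(FieldNames.GENRE, new KeywordAnalyzer());
        // all other fields fall back to the default analyzer
        return new PerFieldAnalyzerWrapper(new StandardAnalyzer(), fieldAnalyzers);
    }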

    /**
     * Detailed tests on punctuation.
     * Besides the common delimiters, many other characters act as delimiters.
     */
    @Test
    public void testPunctuation1() {

        String query = "B︴C";
        String expected = "b︴c";

        /*
         * XXX 3.x -> 8.x :
         * The definition of punctuation has changed.
         */
        Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> {
            List<String> terms = toTermString(n, query);
            switch (n) {

                /*
                 * In the legacy implementation these fields split the input into 2 tokens,
                 * although these fields do not actually need to be tokenized.
                 */
                case FieldNames.FOLDER:
                case FieldNames.GENRE:
                case FieldNames.MEDIA_TYPE:
                    assertEquals("tokenized : " + n, 1, terms.size());
                    assertEquals("tokenized : " + n, expected, terms.get(0));
                    break;

                /*
                 * How should these fields behave?
                 * Such characters are generally discarded.
                 */
                case FieldNames.ARTIST:
                case FieldNames.ALBUM:
                case FieldNames.TITLE:
                    assertEquals("tokenized : " + n, 1, terms.size());
                    assertEquals("tokenized : " + n, expected, terms.get(0));
                    break;
                /*
                 * ID, FOLDER_ID, YEAR
                 * This is not a problem because the input value does not contain a delimiter.
                 */
                default:
                    assertEquals("tokenized : " + n, 2, terms.size());
                    break;
            }
        });
    }

    /*
     * Detailed tests on punctuation.
     * Many of the symbols are delimiters or targets for removal.
     */
    @Test
    public void testPunctuation2() {

        String query = "{'“『【【】】[︴○◎@ $〒→+]";
        Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> {
            List<String> terms = toTermString(n, query);
            switch (n) {
                case FieldNames.FOLDER:
                case FieldNames.MEDIA_TYPE:
                case FieldNames.GENRE:
                case FieldNames.ARTIST:
                case FieldNames.ALBUM:
                case FieldNames.TITLE:
                    assertEquals("removed : " + n, 0, terms.size());
                    break;
                default:
                    assertEquals("removed : " + n, 0, terms.size());
            }
        });
    }

    /**
     * Detailed tests on stopwords.
     *
     * @see org.apache.lucene.analysis.core.StopAnalyzer#ENGLISH_STOP_WORDS_SET
     */
    @Test
    public void testStopward() {

        /*
         * The legacy behavior is to remove ENGLISH_STOP_WORDS_SET from the token stream.
         * (Setting aside whether that matches the specification of a music search.)
         */

        /*
         * Articles.
         * These are included in ENGLISH_STOP_WORDS_SET.
         */
        String queryArticle = "a an the";

        /*
         * Articles set as index-time stopwords by default,
         * but not included in ENGLISH_STOP_WORDS_SET.
         */
        String queryArticle4Index = "el la los las le les";

        /*
         * Non-articles included in ENGLISH_STOP_WORDS_SET.
         * Stopwords are essential for newspapers and documents,
         * but they are often over-processed for song titles.
         * For example, "we will rock you" cannot be found by searching for "will".
         */
        String queryStop = "and are as at be but by for if in into is it no not of on " //
                + "or such that their then there these they this to was will with";

        /*
         * Unique conjunctions often used in artist fields.
         */
        String stopwordsForArtist = "by cv feat vs with";

        Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> {
            List<String> articleTerms = toTermString(n, queryArticle);
            List<String> indexArticleTerms = toTermString(n, queryArticle4Index);
            List<String> stopedTerms = toTermString(n, queryStop);
            List<String> artistTerms = toTermString(n, stopwordsForArtist);

            switch (n) {

                case FieldNames.ALBUM:
                case FieldNames.TITLE:

                    // Deleted because they are stopwords.
                    assertEquals("article : " + n, 0, articleTerms.size());
                    // "la los" are not deleted (#1235).
                    assertEquals("sonic server index article: " + n, 2, indexArticleTerms.size());
                    // Not deleted because they are not stopwords.
                    assertEquals("non-article stop words : " + n, 30, stopedTerms.size());
                    // Not deleted because they are not stopwords for these fields.
                    assertEquals("stop words for artist : " + n, 5, artistTerms.size());
                    break;

                case FieldNames.ARTIST:

                    // Deleted because they are stopwords.
                    assertEquals("article : " + n, 0, articleTerms.size());
                    // "la los" are not deleted (#1235).
                    assertEquals("sonic server index article: " + n, 2, indexArticleTerms.size());
                    // Not deleted because they are not stopwords (except "by" and "with").
                    assertEquals("non-article stop words : " + n, 28, stopedTerms.size());
                    // Deleted because they are stopwords for the artist field.
                    assertEquals("stop words for artist : " + n, 0, artistTerms.size());
                    break;

                default:
                    fail(); // fields that are not analyzed are not applicable here
                    break;
            }
        });

    }
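
    /*
     * Illustration only: a custom stopword set, such as the artist-specific
     * one above, can be applied with a plain StopFilter. This sketches the
     * mechanism; it is not AnalyzerFactory's actual filter chain.
     */
    @SuppressWarnings("unused")
    private TokenStream applyArtistStopwordsSketch(TokenStream in) {
        CharArraySet stopwords = StopFilter.makeStopSet("by", "cv", "feat", "vs", "with");
        return new StopFilter(in, stopwords);
    }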

    /**
     * Simple test on full-width characters.
     */
    @Test
    public void testFullWidth() {
        String query = "FULL-WIDTH";
        List<String> terms = toTermString(query);
        assertEquals(2, terms.size());
        assertEquals("full", terms.get(0));
        assertEquals("width", terms.get(1));
    }

    /**
     * Combined case of Stop and full-width.
     */
    @Test
    public void testStopwardAndFullWidth() {

        /*
         * "THIS" and "IS" are not deleted because they are not
         * in the default stopword set used here.
         */
        String queryHalfWidth = "THIS IS FULL-WIDTH SENTENCES.";
        List<String> terms = toTermString(queryHalfWidth);
        assertEquals(5, terms.size());
        assertEquals("this", terms.get(0));
        assertEquals("is", terms.get(1));
        assertEquals("full", terms.get(2));
        assertEquals("width", terms.get(3));
        assertEquals("sentences", terms.get(4));

        /*
         * In the legacy implementation, full-width input can evade stopword removal.
         * It is unclear whether this is an intentional specification.
         * (A defect in the filter application order?
         * Or is full-width input simply assumed not to occur in English-speaking countries?)
         */
        String queryFullWidth = "THIS IS FULL-WIDTH SENTENCES.";
        terms = toTermString(queryFullWidth);
        /*
         * XXX 3.x -> 8.x :
         *
         * This is not a change caused by the library but an intentional one.
         * The filter order has been corrected,
         * as the old behavior was probably not a deliberate specification.
         */
        assertEquals(5, terms.size());
        assertEquals("this", terms.get(0));
        assertEquals("is", terms.get(1));
        assertEquals("full", terms.get(2));
        assertEquals("width", terms.get(3));
        assertEquals("sentences", terms.get(4));

    }

    /**
     * Tests on ligatures and diacritical marks.
     * UAX#29 does not consider such non-practical word boundaries.
     * Languages that use special character sequences require "practical word" samples,
     * so unit testing with ligatures and diacritical marks alone is not sufficient.
     */
    @Test
    public void testAsciiFoldingStop() {

        String queryLigature = "Cæsar";
        String expectedLigature = "caesar";

        String queryDiacritical = "Café";
        String expectedDiacritical = "cafe";

        Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> {
            List<String> termsLigature = toTermString(n, queryLigature);
            List<String> termsDiacritical = toTermString(n, queryDiacritical);
            switch (n) {

                /*
                 * These are decomposed into the expected strings.
                 */
                case FieldNames.FOLDER:
                case FieldNames.MEDIA_TYPE:
                case FieldNames.GENRE:
                case FieldNames.ARTIST:
                case FieldNames.ALBUM:
                case FieldNames.TITLE:
                    assertEquals("Cæsar : " + n, 1, termsLigature.size());
                    assertEquals("Cæsar : " + n, expectedLigature, termsLigature.get(0));
                    assertEquals("Café : " + n, 1, termsDiacritical.size());
                    assertEquals("Café : " + n, expectedDiacritical, termsDiacritical.get(0));
                    break;

                // Legacy has common behavior for all fields.
                default:
                    assertEquals("Cæsar : " + n, 1, termsLigature.size());
                    assertEquals("Cæsar : " + n, expectedLigature, termsLigature.get(0));
                    assertEquals("Café : " + n, 1, termsDiacritical.size());
                    assertEquals("Café : " + n, expectedDiacritical, termsDiacritical.get(0));
                    break;

            }
        });

    }
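
    /*
     * Illustration only: the folding observed above can be reproduced with
     * Lucene's ASCIIFoldingFilter utility. That AnalyzerFactory applies an
     * equivalent filter is an assumption of this sketch.
     */
    @SuppressWarnings("unused")
    private String foldToAsciiSketch(String input) {
        char[] output = new char[input.length() * 4]; // folding can expand a char up to 4x
        int length = ASCIIFoldingFilter.foldToASCII(input.toCharArray(), 0, output, 0, input.length());
        return new String(output, 0, length); // "Cæsar" -> "Caesar", "Café" -> "Cafe"
    }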

    /**
     * Detailed tests on LowerCase.
     */
    @Test
    public void testLowerCase() {

        // Checks the filter operation only; only a few settings are verified.
        String query = "ABCDEFG";
        String expected = "abcdefg";

        Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> {
            List<String> terms = toTermString(n, query);
            switch (n) {

                /*
                 * In the legacy implementation these are converted to lower case. (Over-processing?)
                 */
                case FieldNames.FOLDER:
                case FieldNames.MEDIA_TYPE:
                    assertEquals("lower : " + n, 1, terms.size());
                    assertEquals("lower : " + n, expected, terms.get(0));
                    break;

                /*
                 * These are searchable fields in lower case.
                 */
                case FieldNames.GENRE:
                case FieldNames.ARTIST:
                case FieldNames.ALBUM:
                case FieldNames.TITLE:
                    assertEquals("lower : " + n, 1, terms.size());
                    assertEquals("lower : " + n, expected, terms.get(0));
                    break;

                // Legacy has common behavior for all fields.
                default:
                    assertEquals("lower : " + n, 1, terms.size());
                    assertEquals("lower : " + n, expected, terms.get(0));
                    break;

            }
        });
    }

    /**
     * Detailed tests on characters that require escaping.
     * Reserved strings are discarded unless they are deliberately escaped.
     * This is acceptable as a search specification (if they are regarded as a kind of reserved stopword).
     * However, it may be a problem for file paths.
     */
    @Test
    public void testLuceneEscapeRequires() {

        String queryEscapeRequires = "+-&&||!(){}[]^\"~*?:\\/";
        String queryFileUsable = "+-&&!(){}[]^~";

        Arrays.stream(IndexType.values()).flatMap(i -> Arrays.stream(i.getFields())).forEach(n -> {
            List<String> terms = toTermString(n, queryEscapeRequires);
            switch (n) {

                /*
                 * Will be removed.
                 * (Directories matching certain patterns can therefore not be distinguished?)
                 */
                case FieldNames.FOLDER:
                    assertEquals("escape : " + n, 0, terms.size());
                    terms = toTermString(n, queryFileUsable);
                    assertEquals("escape : " + n, 0, terms.size());
                    break;

                /*
                 * Will be removed.
                 */
                case FieldNames.MEDIA_TYPE:
                case FieldNames.GENRE:
                case FieldNames.ARTIST:
                case FieldNames.ALBUM:
                case FieldNames.TITLE:
                    assertEquals("escape : " + n, 0, terms.size());
                    break;

                // Will be removed.
                default:
                    assertEquals("escape : " + n, 0, terms.size());
                    break;

            }
        });

    }

    /**
     * Examples that make the UAX#29 differences easy to understand.
     */
    @Test
    public void testUax29() {

        /*
         * Cases using test resource names.
         */

        // Colon, comma and hyphen.
        String query = "Bach: Goldberg Variations, BWV 988 - Aria";
        List<String> terms = toTermString(query);
        assertEquals(6, terms.size());
        assertEquals("bach", terms.get(0));
        assertEquals("goldberg", terms.get(1));
        assertEquals("variations", terms.get(2));
        assertEquals("bwv", terms.get(3));
        assertEquals("988", terms.get(4));
        assertEquals("aria", terms.get(5));

        // Underscores around words, non-ASCII characters and a colon.
        query = "_ID3_ARTIST_ Céline Frisch: Café Zimmermann";
        terms = toTermString(query);
        assertEquals(5, terms.size());

        /*
         * XXX 3.x -> 8.x : _id3_artist_ is a single token under UAX#29.
         * Since the effect is large, the surrounding underscores are trimmed with a Filter.
         */
        assertEquals("id3_artist", terms.get(0));
        assertEquals("celine", terms.get(1));
        assertEquals("frisch", terms.get(2));
        assertEquals("cafe", terms.get(3));
        assertEquals("zimmermann", terms.get(4));

        // Underscores around words and a slash.
        query = "_ID3_ARTIST_ Sarah Walker/Nash Ensemble";
        terms = toTermString(query);
        assertEquals(5, terms.size());

        /*
         * XXX 3.x -> 8.x : _id3_artist_ is a single token under UAX#29.
         * Since the effect is large, the surrounding underscores are trimmed with a Filter.
         */
        assertEquals("id3_artist", terms.get(0));
        assertEquals("sarah", terms.get(1));
        assertEquals("walker", terms.get(2));
        assertEquals("nash", terms.get(3));
        assertEquals("ensemble", terms.get(4));

        // Space
        assertEquals(asList("abc", "def"), toTermString(" ABC DEF "));
        assertEquals(asList("abc1", "def"), toTermString(" ABC1 DEF "));

        // trim and delimiter
        assertEquals(asList("abc", "def"), toTermString("+ABC+DEF+"));
        assertEquals(asList("abc", "def"), toTermString("|ABC|DEF|"));
        assertEquals(asList("abc", "def"), toTermString("!ABC!DEF!"));
        assertEquals(asList("abc", "def"), toTermString("(ABC(DEF("));
        assertEquals(asList("abc", "def"), toTermString(")ABC)DEF)"));
        assertEquals(asList("abc", "def"), toTermString("{ABC{DEF{"));
        assertEquals(asList("abc", "def"), toTermString("}ABC}DEF}"));
        assertEquals(asList("abc", "def"), toTermString("[ABC[DEF["));
        assertEquals(asList("abc", "def"), toTermString("]ABC]DEF]"));
        assertEquals(asList("abc", "def"), toTermString("^ABC^DEF^"));
        assertEquals(asList("abc", "def"), toTermString("\\ABC\\DEF\\"));
        assertEquals(asList("abc", "def"), toTermString("\"ABC\"DEF\""));
        assertEquals(asList("abc", "def"), toTermString("~ABC~DEF~"));
        assertEquals(asList("abc", "def"), toTermString("*ABC*DEF*"));
        assertEquals(asList("abc", "def"), toTermString("?ABC?DEF?"));
        assertEquals(asList("abc:def"), toTermString(":ABC:DEF:"));             // XXX 3.x -> 8.x : abc def -> abc:def
        assertEquals(asList("abc", "def"), toTermString("-ABC-DEF-"));
        assertEquals(asList("abc", "def"), toTermString("/ABC/DEF/"));
        /*
         * XXX 3.x -> 8.x : _abc_def_ is a single token under UAX#29.
         * Since the effect is large, the surrounding underscores are trimmed with a Filter.
         */
        assertEquals(asList("abc_def"), toTermString("_ABC_DEF_"));             // XXX 3.x -> 8.x : abc def -> abc_def
        assertEquals(asList("abc", "def"), toTermString(",ABC,DEF,"));
        assertEquals(asList("abc.def"), toTermString(".ABC.DEF."));
        assertEquals(asList("abc", "def"), toTermString("&ABC&DEF&"));          // XXX 3.x -> 8.x : abc&def -> abc def
        assertEquals(asList("abc", "def"), toTermString("@ABC@DEF@"));          // XXX 3.x -> 8.x : abc@def -> abc def
        assertEquals(asList("abc'def"), toTermString("'ABC'DEF'"));

        // trim and delimiter and number
        assertEquals(asList("abc1", "def"), toTermString("+ABC1+DEF+"));
        assertEquals(asList("abc1", "def"), toTermString("|ABC1|DEF|"));
        assertEquals(asList("abc1", "def"), toTermString("!ABC1!DEF!"));
        assertEquals(asList("abc1", "def"), toTermString("(ABC1(DEF("));
        assertEquals(asList("abc1", "def"), toTermString(")ABC1)DEF)"));
        assertEquals(asList("abc1", "def"), toTermString("{ABC1{DEF{"));
        assertEquals(asList("abc1", "def"), toTermString("}ABC1}DEF}"));
        assertEquals(asList("abc1", "def"), toTermString("[ABC1[DEF["));
        assertEquals(asList("abc1", "def"), toTermString("]ABC1]DEF]"));
        assertEquals(asList("abc1", "def"), toTermString("^ABC1^DEF^"));
        assertEquals(asList("abc1", "def"), toTermString("\\ABC1\\DEF\\"));
        assertEquals(asList("abc1", "def"), toTermString("\"ABC1\"DEF\""));
        assertEquals(asList("abc1", "def"), toTermString("~ABC1~DEF~"));
        assertEquals(asList("abc1", "def"), toTermString("*ABC1*DEF*"));
        assertEquals(asList("abc1", "def"), toTermString("?ABC1?DEF?"));
        assertEquals(asList("abc1", "def"), toTermString(":ABC1:DEF:"));
        assertEquals(asList("abc1", "def"), toTermString(",ABC1,DEF,"));        // XXX 3.x -> 8.x : abc1,def -> abc1 def
        assertEquals(asList("abc1", "def"), toTermString("-ABC1-DEF-"));        // XXX 3.x -> 8.x : abc1-def -> abc1 def
        assertEquals(asList("abc1", "def"), toTermString("/ABC1/DEF/"));        // XXX 3.x -> 8.x : abc1/def -> abc1 def
        /*
         * XXX 3.x -> 8.x : _abc1_def_ is a single token under UAX#29.
         * Since the effect is large, the surrounding underscores are trimmed with a Filter.
         */
        assertEquals(asList("abc1_def"), toTermString("_ABC1_DEF_"));
        assertEquals(asList("abc1", "def"), toTermString(".ABC1.DEF."));        // XXX 3.x -> 8.x : abc1.def -> abc1 def
        assertEquals(asList("abc1", "def"), toTermString("&ABC1&DEF&"));
        assertEquals(asList("abc1", "def"), toTermString("@ABC1@DEF@"));
        assertEquals(asList("abc1", "def"), toTermString("'ABC1'DEF'"));

    }
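
    /*
     * Illustration only: the "trimmed with a Filter" mentioned above could be
     * done with a PatternReplaceFilter stripping leading and trailing
     * underscores. This is a sketch under that assumption, not necessarily how
     * AnalyzerFactory implements the trimming.
     */
    @SuppressWarnings("unused")
    private TokenStream trimUnderscoresSketch(TokenStream in) {
        return new PatternReplaceFilter(in, Pattern.compile("^_+|_+$"), "", true);
    }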

    /**
     * Special handling of single quotes.
     */
    @Test
    public void testSingleQuotes() {

        /*
         * A somewhat cultural behavior that seems to be tied to specific languages.
         */
        String query = "This is Airsonic's analysis.";
        List<String> terms = toTermString(query);
        assertEquals(4, terms.size());
        assertEquals("this", terms.get(0));// Not deleted because it is not a stopword
        assertEquals("is", terms.get(1));// Not deleted because it is not a stopword
        assertEquals("airsonic", terms.get(2));
        assertEquals("analysis", terms.get(3));

        /*
         * XXX 3.x -> 8.x :
         * we ve -> we've
         */
        query = "We’ve been here before.";
        terms = toTermString(query);
        assertEquals(4, terms.size());
        assertEquals("we've", terms.get(0));
        assertEquals("been", terms.get(1));
        assertEquals("here", terms.get(2));
        assertEquals("before", terms.get(3));

        query = "LʼHomme";
        terms = toTermString(query);
        assertEquals(1, terms.size());
        assertEquals("lʼhomme", terms.get(0));

        query = "L'Homme";
        terms = toTermString(query);
        assertEquals(1, terms.size());
        assertEquals("l'homme", terms.get(0));

        query = "aujourd'hui";
        terms = toTermString(query);
        assertEquals(1, terms.size());
        assertEquals("aujourd'hui", terms.get(0));

        query = "fo'c'sle";
        terms = toTermString(query);
        assertEquals(1, terms.size());
        assertEquals("fo'c'sle", terms.get(0));

    }

    /*
     * There are also filters that normalize tense so that searches
     * can match the present tense.
     */
    @Test
    public void testPastParticiple() {

        /*
         * Confirming no conversion to present tense.
         */
        String query = "This is formed with a form of the verb \"have\" and a past participl.";
        List<String> terms = toTermString(query);
        assertEquals(11, terms.size());
        assertEquals("this", terms.get(0));// Not deleted because it is not a stopword
        assertEquals("is", terms.get(1));// Not deleted because it is not a stopword
        assertEquals("formed", terms.get(2));// leave passive / not "form"
        assertEquals("with", terms.get(3));// Not deleted because it is not a stopword
        assertEquals("form", terms.get(4));
        assertEquals("of", terms.get(5));
        assertEquals("verb", terms.get(6));
        assertEquals("have", terms.get(7));
        assertEquals("and", terms.get(8));// Not deleted because it is not a stopword
        assertEquals("past", terms.get(9));
        assertEquals("participl", terms.get(10));

    }

    /*
     * There are also filters that convert plurals to their singular form.
     */
    @Test
    public void testNumeral() {

        /*
         * Confirming no conversion to singular.
         */

        String query = "books boxes cities leaves men glasses";
        List<String> terms = toTermString(query);
        assertEquals(6, terms.size());
        assertEquals("books", terms.get(0));// leave numeral / not singular
        assertEquals("boxes", terms.get(1));
        assertEquals("cities", terms.get(2));
        assertEquals("leaves", terms.get(3));
        assertEquals("men", terms.get(4));
        assertEquals("glasses", terms.get(5));
    }

    @Test
    public void testGenre() {

        /*
         * Confirming the GENRE-specific query term processing.
         */

        String query = "{}";
        List<String> terms = toQueryTermString(FieldNames.GENRE, query);
        assertEquals(1, terms.size());
        assertEquals("{ }", terms.get(0));
    }

    private List<String> toTermString(String str) {
        return toTermString(null, str);
    }

    private List<String> toTermString(String field, String str) {
        List<String> result = new ArrayList<>();
        try (TokenStream stream = analyzerFactory.getAnalyzer().tokenStream(field,
                new StringReader(str))) {
            stream.reset();
            while (stream.incrementToken()) {
                // CharTermAttribute#toString returns the raw term text; the replaceAll
                // defensively strips a legacy "term=" prefix, should one ever appear.
                result.add(stream.getAttribute(CharTermAttribute.class).toString()
                        .replaceAll("^term\\=", ""));
            }
            stream.end();
        } catch (IOException e) {
            LoggerFactory.getLogger(AnalyzerFactoryTestCase.class)
                    .error("Error during token processing.", e);
        }
        return result;
    }

    /*
     * To be added in a later version.
     */
    public void testWildCard() {
    }

    @SuppressWarnings("unused")
    private List<String> toQueryTermString(String field, String str) {
        List<String> result = new ArrayList<>();
        try {
            TokenStream stream = analyzerFactory.getQueryAnalyzer().tokenStream(field,
                    new StringReader(str));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString()
                        .replaceAll("^term\\=", ""));
            }
            stream.close();
        } catch (IOException e) {
            LoggerFactory.getLogger(AnalyzerFactoryTestCase.class)
                    .error("Error during Token processing.", e);
        }
        return result;
    }

}