/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.pattern;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.TestUtil;
import org.junit.Ignore;

/**
 * Tests {@link PatternReplaceCharFilter}
 */
public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
  public void testFailingDot() throws IOException {
    checkOutput(
        "A. .B.", "\\.[\\s]*", ".",
        "A..B.",
        "A..B.");
  }

  public void testLongerReplacement() throws IOException {
    checkOutput(
        "XXabcZZabcYY", "abc", "abcde",
        "XXabcdeZZabcdeYY",
        "XXabcccZZabcccYY");
    checkOutput(
        "XXabcabcYY", "abc", "abcde",
        "XXabcdeabcdeYY",
        "XXabcccabcccYY");
    checkOutput(
        "abcabcYY", "abc", "abcde",
        "abcdeabcdeYY",
        "abcccabcccYY");
    checkOutput(
        "YY", "^", "abcde",
        "abcdeYY",
        // Should be: "-----YY" but we're enforcing non-negative offsets.
        "YYYYYYY");
    checkOutput(
        "YY", "$", "abcde",
        "YYabcde",
        "YYYYYYY");
    checkOutput(
        "XYZ", ".", "abc",
        "abcabcabc",
        "XXXYYYZZZ");
    checkOutput(
        "XYZ", ".", "$0abc",
        "XabcYabcZabc",
        "XXXXYYYYZZZZ");
  }

  public void testShorterReplacement() throws IOException {
    checkOutput(
        "XXabcZZabcYY", "abc", "xy",
        "XXxyZZxyYY",
        "XXabZZabYY");
    checkOutput(
        "XXabcabcYY", "abc", "xy",
        "XXxyxyYY",
        "XXababYY");
    checkOutput(
        "abcabcYY", "abc", "xy",
        "xyxyYY",
        "ababYY");
    checkOutput(
        "abcabcYY", "abc", "",
        "YY",
        "YY");
    checkOutput(
        "YYabcabc", "abc", "",
        "YY",
        "YY");
  }

  private void checkOutput(String input, String pattern, String replacement,
      String expectedOutput, String expectedIndexMatchedOutput) throws IOException {
      CharFilter cs = new PatternReplaceCharFilter(pattern(pattern), replacement,
        new StringReader(input));

    StringBuilder output = new StringBuilder();
    for (int chr = cs.read(); chr > 0; chr = cs.read()) {
      output.append((char) chr);
    }

    StringBuilder indexMatched = new StringBuilder();
    for (int i = 0; i < output.length(); i++) {
      indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i))));
    }

    boolean outputGood = expectedOutput.equals(output.toString());
    boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString());

    if (!outputGood || !indexMatchedGood || false) {
      System.out.println("Pattern : " + pattern);
      System.out.println("Replac. : " + replacement);
      System.out.println("Input   : " + input);
      System.out.println("Output  : " + output);
      System.out.println("Expected: " + expectedOutput);
      System.out.println("Output/i: " + indexMatched);
      System.out.println("Expected: " + expectedIndexMatchedOutput);
      System.out.println();
    }

    assertTrue("Output doesn't match.", outputGood);
    assertTrue("Index-matched output doesn't match.", indexMatchedGood);
  }

  //           1111
  // 01234567890123
  // this is test.
  public void testNothingChange() throws IOException {
    final String BLOCK = "this is test.";
    CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3",
          new StringReader( BLOCK ) );
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts,
        new String[] { "this", "is", "test." },
        new int[] { 0, 5, 8 },
        new int[] { 4, 7, 13 }, 
        BLOCK.length());
  }
  
  // 012345678
  // aa bb cc
  public void testReplaceByEmpty() throws IOException {
    final String BLOCK = "aa bb cc";
    CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "",
          new StringReader( BLOCK ) );
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[] {});
  }
  
  // 012345678
  // aa bb cc
  // aa#bb#cc
  public void test1block1matchSameLength() throws IOException {
    final String BLOCK = "aa bb cc";
    CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3",
          new StringReader( BLOCK ) );
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts,
        new String[] { "aa#bb#cc" },
        new int[] { 0 },
        new int[] { 8 }, 
        BLOCK.length());
  }

  //           11111
  // 012345678901234
  // aa bb cc dd
  // aa##bb###cc dd
  public void test1block1matchLonger() throws IOException {
    final String BLOCK = "aa bb cc dd";
    CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3",
          new StringReader( BLOCK ) );
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts,
        new String[] { "aa##bb###cc", "dd" },
        new int[] { 0, 9 },
        new int[] { 8, 11 },
        BLOCK.length());
  }

  // 01234567
  //  a  a
  //  aa  aa
  public void test1block2matchLonger() throws IOException {
    final String BLOCK = " a  a";
    CharFilter cs = new PatternReplaceCharFilter( pattern("a"), "aa",
          new StringReader( BLOCK ) );
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts,
        new String[] { "aa", "aa" },
        new int[] { 1, 4 },
        new int[] { 2, 5 },
        BLOCK.length());
  }

  //           11111
  // 012345678901234
  // aa  bb   cc dd
  // aa#bb dd
  public void test1block1matchShorter() throws IOException {
    final String BLOCK = "aa  bb   cc dd";
    CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
          new StringReader( BLOCK ) );
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts,
        new String[] { "aa#bb", "dd" },
        new int[] { 0, 12 },
        new int[] { 11, 14 },
        BLOCK.length());
  }

  //           111111111122222222223333
  // 0123456789012345678901234567890123
  //   aa bb cc --- aa bb aa   bb   cc
  //   aa  bb  cc --- aa bb aa  bb  cc
  public void test1blockMultiMatches() throws IOException {
    final String BLOCK = "  aa bb cc --- aa bb aa   bb   cc";
    CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1  $2  $3",
          new StringReader( BLOCK ) );
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts,
        new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
        new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
        new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 },
        BLOCK.length());
  }

  //           11111111112222222222333333333
  // 012345678901234567890123456789012345678
  //   aa bb cc --- aa bb aa. bb aa   bb cc
  //   aa##bb cc --- aa##bb aa. bb aa##bb cc

  //   aa bb cc --- aa bbbaa. bb aa   b cc
  
  public void test2blocksMultiMatches() throws IOException {
    final String BLOCK = "  aa bb cc --- aa bb aa. bb aa   bb cc";

    CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2",
          new StringReader( BLOCK ) );
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts,
        new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
        new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
        new int[] { 7, 10, 14, 20, 24, 27, 35, 38 },
        BLOCK.length());
  }

  //           11111111112222222222333333333
  // 012345678901234567890123456789012345678
  //  a bb - ccc . --- bb a . ccc ccc bb
  //  aa b - c . --- b aa . c c b
  public void testChain() throws IOException {
    final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
    CharFilter cs = new PatternReplaceCharFilter( pattern("a"), "aa",
        new StringReader( BLOCK ) );
    cs = new PatternReplaceCharFilter( pattern("bb"), "b", cs );
    cs = new PatternReplaceCharFilter( pattern("ccc"), "c", cs );
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts,
        new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
        new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
        new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 },
        BLOCK.length());
  }
  
  private Pattern pattern( String p ){
    return Pattern.compile( p );
  }

  /**
   * A demonstration of how backtracking regular expressions can lead to relatively 
   * easy DoS attacks.
   * 
   * @see <a href="http://swtch.com/~rsc/regexp/regexp1.html">"http://swtch.com/~rsc/regexp/regexp1.html"</a>
   */
  @Ignore
  public void testNastyPattern() throws Exception {
    Pattern p = Pattern.compile("(c.+)*xy");
    String input = "[;<!--aecbbaa--><    febcfdc fbb = \"fbeeebff\" fc = dd   >\\';<eefceceaa e= babae\" eacbaff =\"fcfaccacd\" = bcced>>><  bccaafe edb = ecfccdff\"   <?</script><    edbd ebbcd=\"faacfcc\" aeca= bedbc ceeaac =adeafde aadccdaf = \"afcc ffda=aafbe &#x16921ed5\"1843785582']";
    for (int i = 0; i < input.length(); i++) {
      Matcher matcher = p.matcher(input.substring(0, i));
      long t = System.currentTimeMillis();
      if (matcher.find()) {
        System.out.println(matcher.group());
      }
      System.out.println(i + " > " + (System.currentTimeMillis() - t) / 1000.0);
    }
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    int numPatterns = 5;
    Random random = new Random(random().nextLong());
    for (int i = 0; i < numPatterns; i++) {
      final Pattern p = TestUtil.randomPattern(random());

      final String replacement = TestUtil.randomSimpleString(random);
      Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
          return new TokenStreamComponents(tokenizer, tokenizer);
        }

        @Override
        protected Reader initReader(String fieldName, Reader reader) {
          return new PatternReplaceCharFilter(p, replacement, reader);
        }
      };

      /* max input length. don't make it longer -- exponential processing
       * time for certain patterns. */ 
      final int maxInputLength = 30;
      /* ASCII only input?: */
      final boolean asciiOnly = true;
      checkRandomData(random, a, 50 * RANDOM_MULTIPLIER, maxInputLength, asciiOnly);
      a.close();
    }
  }
 }