/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.apex.malhar.lib.io.fs;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;

import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestWatcher;
import org.junit.runner.Description;

import org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator;
import org.apache.apex.malhar.lib.io.fs.AbstractFileInputOperator.DirectoryScanner;
import org.apache.apex.malhar.lib.partitioner.StatelessPartitionerTest.PartitioningContextImpl;
import org.apache.apex.malhar.lib.testbench.CollectorTestSink;
import org.apache.apex.malhar.lib.util.TestUtils;
import org.apache.apex.malhar.lib.wal.FSWindowDataManager;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import com.datatorrent.api.Attribute;
import com.datatorrent.api.Context;
import com.datatorrent.api.DefaultPartition;
import com.datatorrent.api.Operator;
import com.datatorrent.api.Partitioner.Partition;

import static org.apache.apex.malhar.lib.helper.OperatorContextTestHelper.mockOperatorContext;

import com.datatorrent.api.Sink;
import com.datatorrent.api.StatsListener;

public class AbstractFileInputOperatorTest
{
  public static class TestMeta extends TestWatcher
  {
    public String dir = null;
    Context.OperatorContext context;

    @Override
    protected void starting(org.junit.runner.Description description)
    {
      TestUtils.deleteTargetTestClassFolder(description);
      String methodName = description.getMethodName();
      String className = description.getClassName();
      this.dir = "target/" + className + "/" + methodName;
      Attribute.AttributeMap attributes = new Attribute.AttributeMap.DefaultAttributeMap();
      attributes.put(Context.DAGContext.APPLICATION_PATH, dir);
      context = mockOperatorContext(1, attributes);
    }

    @Override
    protected void finished(Description description)
    {
      TestUtils.deleteTargetTestClassFolder(description);
    }
  }

  @Rule
  public TestMeta testMeta = new TestMeta();

  @Test
  public void testSinglePartitonRecursive() throws Exception
  {
    checkSubDir(true);
  }

  @Test
  public void testSinglePartiton() throws Exception
  {
    checkSubDir(false);
  }

  private void checkSubDir(boolean recursive) throws Exception
  {
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);
    HashSet<String> allLines = Sets.newHashSet();
    String subdir = "";
    for (int file = 0; file < 2; file++) {
      subdir += String.format("/depth_%d", file);
      HashSet<String> lines = Sets.newHashSet();
      for (int line = 0; line < 2; line++) {
        lines.add("f" + file + "l" + line);
      }
      allLines.addAll(lines);
      FileUtils.write(new File(testMeta.dir + subdir, "file" + file), StringUtils.join(lines, '\n'));
    }

    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({"unchecked", "rawtypes"})
    CollectorTestSink<Object> sink = (CollectorTestSink)queryResults;
    oper.output.setSink(sink);

    oper.setDirectory(testMeta.dir);
    oper.getScanner().setFilePatternRegexp("((?!target).)*file[\\d]");
    oper.getScanner().setRecursive(recursive);

    oper.setup(testMeta.context);
    for (long wid = 0; wid < 3; wid++) {
      oper.beginWindow(wid);
      oper.emitTuples();
      oper.endWindow();
    }
    oper.teardown();

    int expectedNumTuples = 4;
    if (!recursive) {
      allLines = new HashSet<String>();
      expectedNumTuples = 0;
    }
    Assert.assertEquals("number tuples", expectedNumTuples, queryResults.collectedTuples.size());
    Assert.assertEquals("lines", allLines, new HashSet<String>(queryResults.collectedTuples));

  }

  public static class LineOperator extends LineByLineFileInputOperator
  {
    Set<String> dirPaths = Sets.newHashSet();

    @Override
    protected void visitDirectory(Path filePath)
    {
      dirPaths.add(Path.getPathWithoutSchemeAndAuthority(filePath).toString());
    }
  }

  @Test
  public void testEmptyDirectory() throws Exception
  {
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);
    Set<String> dPaths = Sets.newHashSet();
    dPaths.add(new File(testMeta.dir).getCanonicalPath());

    String subdir01 = "/a";
    dPaths.add(new File(testMeta.dir + subdir01).getCanonicalPath());
    FileUtils.forceMkdir((new File(testMeta.dir + subdir01)));

    String subdir02 = "/b";
    dPaths.add(new File(testMeta.dir + subdir02).getCanonicalPath());
    FileUtils.forceMkdir(new File(testMeta.dir + subdir02));

    String subdir03 = subdir02 + "/c";
    dPaths.add(new File(testMeta.dir + subdir03).getCanonicalPath());
    FileUtils.forceMkdir(new File(testMeta.dir + subdir03));

    String subdir04 = "/d";
    List<String> allLines = Lists.newArrayList();
    HashSet<String> lines = Sets.newHashSet();
    for (int line = 0; line < 5; line++) {
      lines.add("f0" + "l" + line);
    }
    allLines.addAll(lines);
    File testFile = new File(testMeta.dir + subdir04, "file0");
    dPaths.add(new File(testMeta.dir + subdir04).getCanonicalPath());
    FileUtils.write(testFile, StringUtils.join(lines, '\n'));

    LineOperator oper = new LineOperator();
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
    oper.setScanIntervalMillis(0);

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({"unchecked", "rawtypes"})
    CollectorTestSink<Object> sink = (CollectorTestSink)queryResults;
    oper.output.setSink(sink);

    int wid = 0;

    // Read all records to populate processedList in operator.
    oper.setup(testMeta.context);
    for (int i = 0; i < 3; i++) {
      oper.beginWindow(wid);
      oper.emitTuples();
      oper.endWindow();
      wid++;
    }

    Assert.assertEquals("Size", 5, oper.dirPaths.size());
    Assert.assertTrue("Checking Sets", dPaths.equals(oper.dirPaths));
  }

  @Test
  public void testScannerPartitioning() throws Exception
  {
    DirectoryScanner scanner = new DirectoryScanner();
    scanner.setFilePatternRegexp(".*partition([\\d]*)");

    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);
    for (int file = 0; file < 4; file++) {
      FileUtils.write(new File(testMeta.dir, "partition00" + file), "");
    }

    FileSystem fs = FileSystem.get(FileContext.getLocalFSFileContext().getDefaultFileSystem().getUri(), new Configuration());
    List<DirectoryScanner> partitions = scanner.partition(2);
    Set<Path> allFiles = Sets.newHashSet();
    for (DirectoryScanner partition : partitions) {
      Set<Path> files = partition.scan(fs, path, Sets.<String>newHashSet());
      Assert.assertEquals("", 3, files.size());
      allFiles.addAll(files);
    }
    Assert.assertEquals("Found all files " + allFiles, 5, allFiles.size());

  }

  @Test
  public void testPartitioning() throws Exception
  {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());

    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);
    for (int file = 0; file < 4; file++) {
      FileUtils.write(new File(testMeta.dir, "partition00" + file), "");
    }

    List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
    partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = oper.definePartitions(partitions,
        new PartitioningContextImpl(null, 2));
    Assert.assertEquals(2, newPartitions.size());
    Assert.assertEquals(1, oper.getCurrentPartitions()); // partitioned() wasn't called

    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
      Assert.assertNotSame(oper, p.getPartitionedInstance());
      Assert.assertNotSame(oper.getScanner(), p.getPartitionedInstance().getScanner());
      Set<String> consumed = Sets.newHashSet();
      LinkedHashSet<Path> files = p.getPartitionedInstance().getScanner().scan(FileSystem.getLocal(new Configuration(false)), path, consumed);
      Assert.assertEquals("partition " + files, 3, files.size());
    }
  }

  /**
   * Test for testing dynamic partitioning.
   * - Create 4 file with 3 records each.
   * - Create a single partition, and read all records, populating pending files in operator.
   * - Split it in two operators
   * - Try to emit records again, expected result is no record is emitted, as all files are
   *   processed.
   * - Create another 4 files with 3 records each
   * - Try to emit records again, expected result total record emitted 4 * 3 = 12.
   */
  @Test
  public void testPartitioningStateTransfer() throws Exception
  {

    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
    oper.setScanIntervalMillis(0);

    LineByLineFileInputOperator initialState = new Kryo().copy(oper);

    // Create 4 files with 3 records each.
    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);
    int file;
    for (file = 0; file < 4; file++) {
      FileUtils.write(new File(testMeta.dir, "partition00" + file), "a\nb\nc\n");
    }

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({"unchecked", "rawtypes"})
    CollectorTestSink<Object> sink = (CollectorTestSink)queryResults;
    oper.output.setSink(sink);

    int wid = 0;

    // Read all records to populate processedList in operator.
    oper.setup(testMeta.context);
    for (int i = 0; i < 10; i++) {
      oper.beginWindow(wid);
      oper.emitTuples();
      oper.endWindow();
      wid++;
    }
    Assert.assertEquals("All tuples read ", 12, sink.collectedTuples.size());

    Assert.assertEquals(1, initialState.getCurrentPartitions());
    initialState.setPartitionCount(2);
    StatsListener.Response rsp = initialState.processStats(null);
    Assert.assertEquals(true, rsp.repartitionRequired);

    // Create partitions of the operator.
    List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
    partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    // incremental capacity controlled partitionCount property
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = initialState.definePartitions(partitions,
        new PartitioningContextImpl(null, 0));
    Assert.assertEquals(2, newPartitions.size());
    Assert.assertEquals(1, initialState.getCurrentPartitions());
    Map<Integer, Partition<AbstractFileInputOperator<String>>> m = Maps.newHashMap();
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
      m.put(m.size(), p);
    }
    initialState.partitioned(m);
    Assert.assertEquals(2, initialState.getCurrentPartitions());

    /* Collect all operators in a list */
    List<AbstractFileInputOperator<String>> opers = Lists.newArrayList();
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
      LineByLineFileInputOperator oi = (LineByLineFileInputOperator)p.getPartitionedInstance();
      oi.setup(testMeta.context);
      oi.output.setSink(sink);
      opers.add(oi);
    }

    sink.clear();
    for (int i = 0; i < 10; i++) {
      for (AbstractFileInputOperator<String> o : opers) {
        o.beginWindow(wid);
        o.emitTuples();
        o.endWindow();
      }
      wid++;
    }

    // No record should be read.
    Assert.assertEquals("No new tuples read ", 0, sink.collectedTuples.size());

    // Add four new files with 3 records each.
    for (; file < 8; file++) {
      FileUtils.write(new File(testMeta.dir, "partition00" + file), "a\nb\nc\n");
    }

    for (int i = 0; i < 10; i++) {
      for (AbstractFileInputOperator<String> o : opers) {
        o.beginWindow(wid);
        o.emitTuples();
        o.endWindow();
      }
      wid++;
    }

    // If all files are processed only once then number of records emitted should
    // be 12.
    Assert.assertEquals("All tuples read ", 12, sink.collectedTuples.size());
  }

  /**
   * Test for testing dynamic partitioning.
   * - Create 4 file with 3 records each.
   * - Create a single partition, and read some records, populating pending files in operator.
   * - Split it in two operators
   * - Try to emit the remaining records.
   */
  @Test
  public void testPartitioningStateTransferInterrupted() throws Exception
  {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
    oper.setScanIntervalMillis(0);
    oper.setEmitBatchSize(2);

    LineByLineFileInputOperator initialState = new Kryo().copy(oper);

    // Create 4 files with 3 records each.
    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);
    int file;
    for (file = 0; file < 4; file++) {
      FileUtils.write(new File(testMeta.dir, "partition00" + file), "a\nb\nc\n");
    }

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({"unchecked", "rawtypes"})
    CollectorTestSink<Object> sink = (CollectorTestSink)queryResults;
    oper.output.setSink(sink);

    int wid = 0;

    //Read some records
    oper.setup(testMeta.context);
    for (int i = 0; i < 5; i++) {
      oper.beginWindow(wid);
      oper.emitTuples();
      oper.endWindow();
      wid++;
    }

    Assert.assertEquals("Partial tuples read ", 6, sink.collectedTuples.size());

    Assert.assertEquals(1, initialState.getCurrentPartitions());
    initialState.setPartitionCount(2);
    StatsListener.Response rsp = initialState.processStats(null);
    Assert.assertEquals(true, rsp.repartitionRequired);

    // Create partitions of the operator.
    List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
    partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    // incremental capacity controlled partitionCount property
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = initialState.definePartitions(partitions, new PartitioningContextImpl(null, 0));
    Assert.assertEquals(2, newPartitions.size());
    Assert.assertEquals(1, initialState.getCurrentPartitions());
    Map<Integer, Partition<AbstractFileInputOperator<String>>> m = Maps.newHashMap();
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
      m.put(m.size(), p);
    }
    initialState.partitioned(m);
    Assert.assertEquals(2, initialState.getCurrentPartitions());

    /* Collect all operators in a list */
    List<AbstractFileInputOperator<String>> opers = Lists.newArrayList();
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
      LineByLineFileInputOperator oi = (LineByLineFileInputOperator)p.getPartitionedInstance();
      oi.setup(testMeta.context);
      oi.output.setSink(sink);
      opers.add(oi);
    }

    sink.clear();
    for (int i = 0; i < 10; i++) {
      for (AbstractFileInputOperator<String> o : opers) {
        o.beginWindow(wid);
        o.emitTuples();
        o.endWindow();
      }
      wid++;
    }

    Assert.assertEquals("Remaining tuples read ", 6, sink.collectedTuples.size());
  }

  /**
   * Test for testing dynamic partitioning interrupting ongoing read.
   * - Create 4 file with 3 records each.
   * - Create a single partition, and read some records, populating pending files in operator.
   * - Split it in two operators
   * - Try to emit the remaining records.
   */
  @Test
  public void testPartitioningStateTransferFailure() throws Exception
  {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
    oper.setScanIntervalMillis(0);
    oper.setEmitBatchSize(2);

    LineByLineFileInputOperator initialState = new Kryo().copy(oper);

    // Create 4 files with 3 records each.
    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);
    int file;
    for (file = 0; file < 4; file++) {
      FileUtils.write(new File(testMeta.dir, "partition00" + file), "a\nb\nc\n");
    }

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({"unchecked", "rawtypes"})
    CollectorTestSink<Object> sink = (CollectorTestSink)queryResults;
    oper.output.setSink(sink);

    int wid = 0;

    //Read some records
    oper.setup(testMeta.context);
    for (int i = 0; i < 5; i++) {
      oper.beginWindow(wid);
      oper.emitTuples();
      oper.endWindow();
      wid++;
    }

    Assert.assertEquals("Partial tuples read ", 6, sink.collectedTuples.size());

    Assert.assertEquals(1, initialState.getCurrentPartitions());
    initialState.setPartitionCount(2);
    StatsListener.Response rsp = initialState.processStats(null);
    Assert.assertEquals(true, rsp.repartitionRequired);

    // Create partitions of the operator.
    List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
    partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    // incremental capacity controlled partitionCount property
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = initialState.definePartitions(partitions, new PartitioningContextImpl(null, 0));
    Assert.assertEquals(2, newPartitions.size());
    Assert.assertEquals(1, initialState.getCurrentPartitions());
    Map<Integer, Partition<AbstractFileInputOperator<String>>> m = Maps.newHashMap();
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
      m.put(m.size(), p);
    }
    initialState.partitioned(m);
    Assert.assertEquals(2, initialState.getCurrentPartitions());

    /* Collect all operators in a list */
    List<AbstractFileInputOperator<String>> opers = Lists.newArrayList();
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
      LineByLineFileInputOperator oi = (LineByLineFileInputOperator)p.getPartitionedInstance();
      oi.setup(testMeta.context);
      oi.output.setSink(sink);
      opers.add(oi);
    }

    sink.clear();
    for (int i = 0; i < 10; i++) {
      for (AbstractFileInputOperator<String> o : opers) {
        o.beginWindow(wid);
        o.emitTuples();
        o.endWindow();
      }
      wid++;
    }

    // No record should be read.
    Assert.assertEquals("Remaining tuples read ", 6, sink.collectedTuples.size());
  }

  @Test
  public void testRecoveryWithFailedFile() throws Exception
  {
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);
    List<String> allLines = Lists.newArrayList();
    HashSet<String> lines = Sets.newHashSet();
    for (int line = 0; line < 5; line++) {
      lines.add("f0" + "l" + line);
    }
    allLines.addAll(lines);
    File testFile = new File(testMeta.dir, "file0");
    FileUtils.write(testFile, StringUtils.join(lines, '\n'));


    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.scanner = null;
    oper.failedFiles.add(new AbstractFileInputOperator.FailedFile(testFile.getAbsolutePath(), 1));

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({ "unchecked", "rawtypes" })
    CollectorTestSink<Object> sink = (CollectorTestSink)queryResults;
    oper.output.setSink(sink);

    oper.setDirectory(testMeta.dir);

    oper.setup(testMeta.context);
    oper.beginWindow(0);
    oper.emitTuples();
    oper.endWindow();

    oper.teardown();

    Assert.assertEquals("number tuples", 4, queryResults.collectedTuples.size());
    Assert.assertEquals("lines", allLines.subList(1, allLines.size()), new ArrayList<String>(queryResults.collectedTuples));
  }

  @Test
  public void testRecoveryWithUnfinishedFile() throws Exception
  {
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);
    List<String> allLines = Lists.newArrayList();
    HashSet<String> lines = Sets.newHashSet();
    for (int line = 0; line < 5; line++) {
      lines.add("f0" + "l" + line);
    }
    allLines.addAll(lines);
    File testFile = new File(testMeta.dir, "file0");
    FileUtils.write(testFile, StringUtils.join(lines, '\n'));

    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.scanner = null;
    oper.unfinishedFiles.add(new AbstractFileInputOperator.FailedFile(testFile.getAbsolutePath(), 2));

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({"unchecked", "rawtypes"})
    CollectorTestSink<Object> sink = (CollectorTestSink)queryResults;
    oper.output.setSink(sink);

    oper.setDirectory(testMeta.dir);

    oper.setup(testMeta.context);
    oper.beginWindow(0);
    oper.emitTuples();
    oper.endWindow();

    oper.teardown();

    Assert.assertEquals("number tuples", 3, queryResults.collectedTuples.size());
    Assert.assertEquals("lines", allLines.subList(2, allLines.size()), new ArrayList<String>(queryResults.collectedTuples));
  }

  @Test
  public void testRecoveryWithPendingFile() throws Exception
  {
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);
    List<String> allLines = Lists.newArrayList();
    HashSet<String> lines = Sets.newHashSet();
    for (int line = 0; line < 5; line++) {
      lines.add("f0" + "l" + line);
    }
    allLines.addAll(lines);
    File testFile = new File(testMeta.dir, "file0");
    FileUtils.write(testFile, StringUtils.join(lines, '\n'));

    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.scanner = null;
    oper.pendingFiles.add(testFile.getAbsolutePath());

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({"unchecked", "rawtypes"})
    CollectorTestSink<Object> sink = (CollectorTestSink)queryResults;
    oper.output.setSink(sink);

    oper.setDirectory(testMeta.dir);

    oper.setup(testMeta.context);
    oper.beginWindow(0);
    oper.emitTuples();
    oper.endWindow();

    oper.teardown();

    Assert.assertEquals("number tuples", 5, queryResults.collectedTuples.size());
    Assert.assertEquals("lines", allLines, new ArrayList<String>(queryResults.collectedTuples));
  }

  @Test
  public void testRecoveryWithCurrentFile() throws Exception
  {
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);
    List<String> allLines = Lists.newArrayList();
    HashSet<String> lines = Sets.newHashSet();
    for (int line = 0; line < 5; line++) {
      lines.add("f0" + "l" + line);
    }
    allLines.addAll(lines);
    File testFile = new File(testMeta.dir, "file0");
    FileUtils.write(testFile, StringUtils.join(lines, '\n'));

    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.scanner = null;
    oper.currentFile = testFile.getAbsolutePath();
    oper.offset = 1;

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({"unchecked", "rawtypes"})
    CollectorTestSink<Object> sink = (CollectorTestSink)queryResults;
    oper.output.setSink(sink);

    oper.setDirectory(testMeta.dir);

    oper.setup(testMeta.context);
    oper.beginWindow(0);
    oper.emitTuples();
    oper.endWindow();

    oper.teardown();

    Assert.assertEquals("number tuples", 4, queryResults.collectedTuples.size());
    Assert.assertEquals("lines", allLines.subList(1, allLines.size()), new ArrayList<String>(queryResults.collectedTuples));
  }

  @Test
  public void testIdempotency() throws Exception
  {
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);

    List<String> allLines = Lists.newArrayList();
    for (int file = 0; file < 2; file++) {
      List<String> lines = Lists.newArrayList();
      for (int line = 0; line < 2; line++) {
        lines.add("f" + file + "l" + line);
      }
      allLines.addAll(lines);
      FileUtils.write(new File(testMeta.dir, "file" + file), StringUtils.join(lines, '\n'));
    }

    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    FSWindowDataManager manager = new FSWindowDataManager();
    manager.setStatePath(testMeta.dir + "/recovery");

    oper.setWindowDataManager(manager);

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    TestUtils.setSink(oper.output, queryResults);

    oper.setDirectory(testMeta.dir);
    oper.getScanner().setFilePatternRegexp(".*file[\\d]");

    oper.setup(testMeta.context);
    for (long wid = 0; wid < 3; wid++) {
      oper.beginWindow(wid);
      oper.emitTuples();
      oper.endWindow();
    }
    oper.teardown();
    List<String> beforeRecovery = Lists.newArrayList(queryResults.collectedTuples);

    queryResults.clear();

    //idempotency  part
    oper.setup(testMeta.context);
    for (long wid = 0; wid < 3; wid++) {
      oper.beginWindow(wid);
      oper.endWindow();
    }
    Assert.assertEquals("number tuples", 4, queryResults.collectedTuples.size());
    Assert.assertEquals("lines", beforeRecovery, queryResults.collectedTuples);
    oper.teardown();
  }

  @Test
  public void testIdempotencyWithMultipleEmitTuples() throws Exception
  {
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);

    List<String> allLines = Lists.newArrayList();
    for (int file = 0; file < 2; file++) {
      List<String> lines = Lists.newArrayList();
      for (int line = 0; line < 2; line++) {
        lines.add("f" + file + "l" + line);
      }
      allLines.addAll(lines);
      FileUtils.write(new File(testMeta.dir, "file" + file), StringUtils.join(lines, '\n'));
    }

    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    FSWindowDataManager manager = new FSWindowDataManager();
    manager.setStatePath(testMeta.dir + "/recovery");

    oper.setWindowDataManager(manager);

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    TestUtils.setSink(oper.output, queryResults);

    oper.setDirectory(testMeta.dir);
    oper.getScanner().setFilePatternRegexp(".*file[\\d]");

    oper.setup(testMeta.context);
    oper.beginWindow(0);
    for (int i = 0; i < 3; i++) {
      oper.emitTuples();
    }
    oper.endWindow();
    oper.teardown();
    List<String> beforeRecovery = Lists.newArrayList(queryResults.collectedTuples);

    queryResults.clear();

    //idempotency  part
    oper.setup(testMeta.context);
    oper.beginWindow(0);
    oper.endWindow();
    Assert.assertEquals("number tuples", 4, queryResults.collectedTuples.size());
    Assert.assertEquals("lines", beforeRecovery, queryResults.collectedTuples);
    oper.teardown();
  }

  @Test
  public void testIdempotencyWhenFileContinued() throws Exception
  {
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);

    List<String> lines = Lists.newArrayList();
    for (int line = 0; line < 10; line++) {
      lines.add("l" + line);
    }
    FileUtils.write(new File(testMeta.dir, "file0"), StringUtils.join(lines, '\n'));

    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    FSWindowDataManager manager = new FSWindowDataManager();
    manager.setStatePath(testMeta.dir + "/recovery");
    oper.setEmitBatchSize(5);

    oper.setWindowDataManager(manager);

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({"unchecked", "rawtypes"})
    CollectorTestSink<Object> sink = (CollectorTestSink)queryResults;
    oper.output.setSink(sink);

    oper.setDirectory(testMeta.dir);
    oper.getScanner().setFilePatternRegexp(".*file[\\d]");

    oper.setup(testMeta.context);
    int offset = 0;
    for (long wid = 0; wid < 3; wid++) {
      oper.beginWindow(wid);
      oper.emitTuples();
      oper.endWindow();
      if (wid > 0) {
        Assert.assertEquals("number tuples", 5, queryResults.collectedTuples.size());
        Assert.assertEquals("lines", lines.subList(offset, offset + 5), queryResults.collectedTuples);
        offset += 5;
      }
      sink.clear();
    }
    oper.teardown();
    sink.clear();

    //idempotency  part
    offset = 0;
    oper.setup(testMeta.context);
    for (long wid = 0; wid < 3; wid++) {
      oper.beginWindow(wid);
      oper.endWindow();
      if (wid > 0) {
        Assert.assertEquals("number tuples", 5, queryResults.collectedTuples.size());
        Assert.assertEquals("lines", lines.subList(offset, offset + 5), queryResults.collectedTuples);
        offset += 5;
      }
      sink.clear();
    }
    oper.teardown();
  }

  @Test
  public void testStateWithIdempotency() throws Exception
  {
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);

    HashSet<String> allLines = Sets.newHashSet();
    for (int file = 0; file < 3; file++) {
      HashSet<String> lines = Sets.newHashSet();
      for (int line = 0; line < 2; line++) {
        lines.add("f" + file + "l" + line);
      }
      allLines.addAll(lines);
      FileUtils.write(new File(testMeta.dir, "file" + file), StringUtils.join(lines, '\n'));
    }

    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();

    FSWindowDataManager manager = new FSWindowDataManager();
    manager.setStatePath(testMeta.dir + "/recovery");

    oper.setWindowDataManager(manager);

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({"unchecked", "rawtypes"})
    CollectorTestSink<Object> sink = (CollectorTestSink)queryResults;
    oper.output.setSink(sink);

    oper.setDirectory(testMeta.dir);
    oper.getScanner().setFilePatternRegexp(".*file[\\d]");

    oper.setup(testMeta.context);
    for (long wid = 0; wid < 4; wid++) {
      oper.beginWindow(wid);
      oper.emitTuples();
      oper.endWindow();
    }
    oper.teardown();

    sink.clear();

    //idempotency  part
    oper.pendingFiles.add(new File(testMeta.dir, "file0").getAbsolutePath());
    oper.failedFiles.add(new AbstractFileInputOperator.FailedFile(new File(testMeta.dir, "file1").getAbsolutePath(), 0));
    oper.unfinishedFiles.add(new AbstractFileInputOperator.FailedFile(new File(testMeta.dir, "file2").getAbsolutePath(), 0));

    oper.setup(testMeta.context);
    for (long wid = 0; wid < 4; wid++) {
      oper.beginWindow(wid);
      oper.endWindow();
    }
    Assert.assertTrue("pending state", !oper.pendingFiles.contains("file0"));

    for (AbstractFileInputOperator.FailedFile failedFile : oper.failedFiles) {
      Assert.assertTrue("failed state", !failedFile.path.equals("file1"));
    }

    for (AbstractFileInputOperator.FailedFile unfinishedFile : oper.unfinishedFiles) {
      Assert.assertTrue("unfinished state", !unfinishedFile.path.equals("file2"));
    }
    oper.teardown();
  }

  @Test
  public void testIdempotencyWithCheckPoint() throws Exception
  {
    testIdempotencyWithCheckPoint(new LineByLineFileInputOperator(), new CollectorTestSink<String>(), new IdempotencyTestDriver<LineByLineFileInputOperator>()
    {
      @Override
      public void writeFile(int count, String fileName) throws IOException
      {
        List<String> lines = Lists.newArrayList();
        for (int line = 0; line < count; line++) {
          lines.add(fileName + "l" + line);
        }
        FileUtils.write(new File(testMeta.dir, fileName), StringUtils.join(lines, '\n'));
      }

      @Override
      public void setSink(LineByLineFileInputOperator operator, Sink<?> sink)
      {
        TestUtils.setSink(operator.output, sink);
      }

      @Override
      public String getDirectory()
      {
        return testMeta.dir;
      }

      @Override
      public Context.OperatorContext getContext()
      {
        return testMeta.context;
      }
    });
  }

  public interface IdempotencyTestDriver<T extends Operator>
  {
    void writeFile(int count, String fileName) throws IOException;

    void setSink(T operator, Sink<?> sink);

    String  getDirectory();

    Context.OperatorContext getContext();
  }

  public static <S extends AbstractFileInputOperator, T> void testIdempotencyWithCheckPoint(S oper, CollectorTestSink<T> queryResults, IdempotencyTestDriver<S> driver) throws Exception
  {
    FileContext.getLocalFSFileContext().delete(new Path(new File(driver.getDirectory()).getAbsolutePath()), true);

    int file = 0;
    driver.writeFile(5, "file" + file);

    file = 1;
    driver.writeFile(6, "file" + file);

    // empty file
    file = 2;
    driver.writeFile(0, "file" + file);

    FSWindowDataManager manager = new FSWindowDataManager();
    manager.setStatePath(driver.getDirectory() + "/recovery");

    oper.setWindowDataManager(manager);

    oper.setDirectory(driver.getDirectory());
    oper.getScanner().setFilePatternRegexp(".*file[\\d]");

    oper.setup(driver.getContext());

    oper.setEmitBatchSize(3);

    // sort the pendingFiles and ensure the ordering of the files scanned
    DirectoryScannerNew newScanner = new DirectoryScannerNew();
    oper.setScanner(newScanner);

    // scan directory
    oper.beginWindow(0);
    oper.emitTuples();
    oper.endWindow();

    // emit f0l0, f0l1, f0l2
    oper.beginWindow(1);
    oper.emitTuples();
    oper.endWindow();

    //checkpoint the operator
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    S checkPointOper = checkpoint(oper, bos);

    // start saving output
    driver.setSink(oper, queryResults);

    // emit f0l3, f0l4, and closeFile(f0) in the same window
    oper.beginWindow(2);
    oper.emitTuples();
    oper.endWindow();
    List<T> beforeRecovery2 = Lists.newArrayList(queryResults.collectedTuples);

    // emit f1l0, f1l1, f1l2
    oper.beginWindow(3);
    oper.emitTuples();
    oper.endWindow();
    List<T> beforeRecovery3 = Lists.newArrayList(queryResults.collectedTuples);

    // emit f1l3, f1l4, f1l5
    oper.beginWindow(4);
    oper.emitTuples();
    oper.endWindow();
    List<T> beforeRecovery4 = Lists.newArrayList(queryResults.collectedTuples);

    // closeFile(f1) in a new window
    oper.beginWindow(5);
    oper.emitTuples();
    oper.endWindow();
    List<T> beforeRecovery5 = Lists.newArrayList(queryResults.collectedTuples);

    // empty file ops, closeFile(f2) in emitTuples() only
    oper.beginWindow(6);
    oper.emitTuples();
    oper.endWindow();
    List<T> beforeRecovery6 = Lists.newArrayList(queryResults.collectedTuples);

    oper.teardown();

    queryResults.clear();

    //idempotency  part

    oper = restoreCheckPoint(checkPointOper, bos);
    driver.getContext().getAttributes().put(Context.OperatorContext.ACTIVATION_WINDOW_ID, 1L);
    oper.setup(driver.getContext());
    driver.setSink(oper, queryResults);

    long startwid = driver.getContext().getAttributes().get(Context.OperatorContext.ACTIVATION_WINDOW_ID) + 1;

    oper.beginWindow(startwid);
    Assert.assertTrue(oper.currentFile == null);
    oper.emitTuples();
    oper.endWindow();
    Assert.assertEquals("lines", beforeRecovery2, queryResults.collectedTuples);

    oper.beginWindow(++startwid);
    oper.emitTuples();
    oper.endWindow();
    Assert.assertEquals("lines", beforeRecovery3, queryResults.collectedTuples);

    oper.beginWindow(++startwid);
    oper.emitTuples();
    oper.endWindow();
    Assert.assertEquals("lines", beforeRecovery4, queryResults.collectedTuples);

    oper.beginWindow(++startwid);
    Assert.assertTrue(oper.currentFile == null);
    oper.emitTuples();
    oper.endWindow();
    Assert.assertEquals("lines", beforeRecovery5, queryResults.collectedTuples);

    oper.beginWindow(++startwid);
    Assert.assertTrue(oper.currentFile == null);
    oper.emitTuples();
    oper.endWindow();
    Assert.assertEquals("lines", beforeRecovery6, queryResults.collectedTuples);

    Assert.assertEquals("number tuples", 8, queryResults.collectedTuples.size());

    oper.teardown();
  }

  /**
   * This method checkpoints the given operator.
   * @param oper The operator to checkpoint.
   * @param bos The ByteArrayOutputStream which saves the checkpoint data temporarily.
   * @return new operator.
   */
  public static <T> T checkpoint(T oper, ByteArrayOutputStream bos) throws Exception
  {
    Kryo kryo = new Kryo();

    Output loutput = new Output(bos);
    kryo.writeObject(loutput, oper);
    loutput.close();

    Input lInput = new Input(bos.toByteArray());
    @SuppressWarnings("unchecked")
    T checkPointedOper = kryo.readObject(lInput, (Class<T>)oper.getClass());
    lInput.close();

    return checkPointedOper;
  }

  /**
   * Restores the checkpointed operator.
   * @param checkPointOper The checkpointed operator.
   * @param bos The ByteArrayOutputStream which saves the checkpoint data temporarily.
   */
  @SuppressWarnings({"unchecked", "rawtypes"})
  public static <T> T restoreCheckPoint(T checkPointOper, ByteArrayOutputStream bos) throws Exception
  {
    Kryo kryo = new Kryo();

    Input lInput = new Input(bos.toByteArray());
    T oper = kryo.readObject(lInput, (Class<T>)checkPointOper.getClass());
    lInput.close();

    return oper;
  }

  @Test
  public void testWindowDataManagerPartitioning() throws Exception
  {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
    oper.setWindowDataManager(new FSWindowDataManager());
    oper.operatorId = 7;

    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);
    for (int file = 0; file < 4; file++) {
      FileUtils.write(new File(testMeta.dir, "partition00" + file), "");
    }

    List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
    partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));

    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = oper.definePartitions(partitions, new PartitioningContextImpl(null, 2));
    Assert.assertEquals(2, newPartitions.size());
    Assert.assertEquals(1, oper.getCurrentPartitions());

    List<FSWindowDataManager> storageManagers = Lists.newLinkedList();
    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
      storageManagers.add((FSWindowDataManager)p.getPartitionedInstance().getWindowDataManager());
    }
    Assert.assertEquals("count of storage managers", 2, storageManagers.size());

    int countOfDeleteManagers = 0;
    FSWindowDataManager deleteManager = null;
    for (FSWindowDataManager storageManager : storageManagers) {
      if (storageManager.getDeletedOperators() != null) {
        countOfDeleteManagers++;
        deleteManager = storageManager;
      }
    }

    Assert.assertEquals("count of delete managers", 1, countOfDeleteManagers);
    Assert.assertNotNull("deleted operators manager", deleteManager);
    Assert.assertEquals("deleted operators", Sets.newHashSet(7), deleteManager.getDeletedOperators());
  }

  /** scanner to extract partition id from start of the filename */
  static class MyScanner extends AbstractFileInputOperator.DirectoryScanner
  {
    @Override
    protected int getPartition(String filePathStr)
    {
      String[] parts = filePathStr.split("/");
      parts = parts[parts.length - 1].split("_");
      try {
        int code = Integer.parseInt(parts[0]);
        return code;
      } catch (NumberFormatException ex) {
        return super.getPartition(filePathStr);
      }
    }
  }

  /**
   * Partition the operator in 2
   * create ten files with index of the file at the start, i.e 1_file, 2_file .. etc.
   * The scanner returns this index from getPartition method.
   * each partition should read 5 files as file index are from 0 to 9 (including 0 and 9).
   * @throws Exception
   */
  @Test
  public void testWithCustomScanner() throws Exception
  {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.setScanner(new MyScanner());
    oper.getScanner().setFilePatternRegexp(".*partition_([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());

    Random rand = new Random();
    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);
    for (int file = 0; file < 10; file++) {
      FileUtils.write(new File(testMeta.dir, file + "_partition_00" + rand.nextInt(100)), "");
    }

    List<Partition<AbstractFileInputOperator<String>>> partitions = Lists.newArrayList();
    partitions.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions = oper.definePartitions(partitions,
        new PartitioningContextImpl(null, 2));
    Assert.assertEquals(2, newPartitions.size());
    Assert.assertEquals(1, oper.getCurrentPartitions()); // partitioned() wasn't called

    for (Partition<AbstractFileInputOperator<String>> p : newPartitions) {
      Assert.assertNotSame(oper, p.getPartitionedInstance());
      Assert.assertNotSame(oper.getScanner(), p.getPartitionedInstance().getScanner());
      Set<String> consumed = Sets.newHashSet();
      LinkedHashSet<Path> files = p.getPartitionedInstance().getScanner().scan(FileSystem.getLocal(new Configuration(false)), path, consumed);
      Assert.assertEquals("partition " + files, 6, files.size());
    }
  }

  @Test
  public void testCustomScanner()
  {
    MyScanner scanner = new MyScanner();
    scanner.setPartitionCount(2);

    scanner.setPartitionIndex(1);
    boolean accepted = scanner.acceptFile("1_file");
    Assert.assertTrue("File should be accepted by this partition ", accepted);

    scanner.setPartitionIndex(0);
    accepted = scanner.acceptFile("1_file");
    Assert.assertFalse("File should not be accepted by this partition ", accepted);
  }

  private static class DirectoryScannerNew extends DirectoryScanner
  {
    public LinkedHashSet<Path> scan(FileSystem fs, Path filePath, Set<String> consumedFiles)
    {
      LinkedHashSet<Path> pathSet;
      pathSet = super.scan(fs, filePath, consumedFiles);

      TreeSet<Path> orderFiles = new TreeSet<>();
      orderFiles.addAll(pathSet);
      pathSet.clear();
      Iterator<Path> fileIterator = orderFiles.iterator();
      while (fileIterator.hasNext()) {
        pathSet.add(fileIterator.next());
      }

      return pathSet;
    }
  }
}