package edu.stanford.nlp.mt.util;

import java.io.IOException;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.KryoSerializable;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;

import edu.stanford.nlp.mt.util.TimingUtils.TimeKeeper;

/**
 * An implementation of a parallel suffix array.
 * 
 * NOTE: The fields are protected, non-final for fast serialization/deserialization.
 * 
 * @author Spence Green
 *
 */
public class ParallelSuffixArray implements Serializable,KryoSerializable {

  private static final long serialVersionUID = -5403502473957235135L;

  private static final Logger logger = LogManager.getLogger(ParallelSuffixArray.class);
  
  protected int[] srcBitext;
  protected int[] f2e;
  protected int[] tgtBitext;
  protected int[] e2f;
  protected int[] srcSuffixArray; 
  protected int[] tgtSuffixArray;
  
  protected int numSentences;
  protected Vocabulary vocabulary;
  
  // Cache unigram positions in the target for the count() function.
  // The sample function already supports initialization with bounds, which
  // the calling method should maintain.
  protected transient int[] tgtCountLBCache;
  protected transient int[] tgtCountUBCache;
  
  /**
   * Creates an empty instance. No field is initialized here; the state is
   * expected to be filled in afterwards by deserialization.
   */
  public ParallelSuffixArray() {
    // Intentionally empty.
  }

  /**
   * Creates an instance from an in-memory corpus. Careful: this constructor
   * doubles peak memory.
   * <p>
   * Only the flattened bitext and alignment arrays are populated here; the
   * suffix arrays are created by {@link #build()}.
   *
   * @param corpus the parallel corpus to flatten
   */
  public ParallelSuffixArray(ParallelCorpus corpus) {
    this.loadCorpus(corpus);
  }
  
  /**
   * Creates an instance by streaming the corpus from disk. Memory-efficient
   * for large files.
   * <p>
   * Only the flattened bitext and alignment arrays are populated here; the
   * suffix arrays are created by {@link #build()}.
   *
   * @param sourceFile path of the source-side file
   * @param targetFile path of the target-side file
   * @param alignFile path of the alignment file
   * @throws IOException if one of the files cannot be read
   */
  public ParallelSuffixArray(String sourceFile, String targetFile, String alignFile) throws IOException {
    this.loadCorpus(sourceFile, targetFile, alignFile);
  }
  

  /**
   * Kryo serialization hook. Writes the six int arrays (each prefixed with
   * its length), then the sentence count, then the vocabulary. The order
   * here must mirror the order used by {@link #read(Kryo, Input)}.
   *
   * @param kryo the Kryo instance used to serialize the vocabulary
   * @param output the destination stream
   */
  @Override
  public void write(Kryo kryo, Output output) {
    final int[][] arraysInWireOrder = {
        srcBitext, tgtBitext, e2f, f2e, srcSuffixArray, tgtSuffixArray
    };
    for (int[] array : arraysInWireOrder) {
      writeArray(array, output);
    }
    output.writeInt(numSentences, true);
    kryo.writeObject(output, vocabulary);
  }

  /**
   * Writes an int array as a length prefix followed by its elements.
   *
   * @param arr the array to write; must not be null
   * @param output the destination stream
   */
  private static void writeArray(int[] arr, Output output) {
    final int length = arr.length;
    output.writeInt(length, true);
    output.writeInts(arr, true);
  }

  /**
   * Kryo deserialization hook. Reads the fields in exactly the order in
   * which {@link #write(Kryo, Output)} emitted them. The transient count
   * caches are not touched by this method.
   *
   * @param kryo the Kryo instance used to deserialize the vocabulary
   * @param input the stream to read from
   */
  @Override
  public void read(Kryo kryo, Input input) {
    // Bitext token arrays
    this.srcBitext = readArray(input);
    this.tgtBitext = readArray(input);
    // Alignment arrays (note: e2f precedes f2e on the wire)
    this.e2f = readArray(input);
    this.f2e = readArray(input);
    // Suffix arrays
    this.srcSuffixArray = readArray(input);
    this.tgtSuffixArray = readArray(input);
    // Scalars and vocabulary
    this.numSentences = input.readInt(true);
    this.vocabulary = kryo.readObject(input, Vocabulary.class);
  }
  
  /**
   * Reads an int array stored as a length prefix followed by its elements.
   *
   * @param input the stream to read from
   * @return the array that was read
   */
  private static int[] readArray(Input input) {
    final int length = input.readInt(true);
    final int[] values = input.readInts(length, true);
    return values;
  }

  /**
   * Get the vocabulary (token index) associated with this suffix array.
   *
   * @return the vocabulary held by this instance
   */
  public Vocabulary getVocabulary() {
    return this.vocabulary;
  }
  
  /**
   * Return a sequential stream of the sentence pairs in this bitext.
   * <p>
   * Every negative entry of the source bitext is a sentence-boundary marker;
   * one {@link SentencePair} is created per marker, anchored at the position
   * immediately before it.
   *
   * @return a stream with one sentence pair per boundary marker
   */
  public Stream<SentencePair> stream() {
    return IntStream.range(0, srcBitext.length)
        .filter(pos -> srcBitext[pos] < 0)
        .mapToObj(boundary -> new SentencePair(boundary - 1));
  }
  
  /**
   * Return a parallel stream of the sentence pairs in this bitext.
   * <p>
   * Every negative entry of the source bitext is a sentence-boundary marker;
   * one {@link SentencePair} is created per marker, anchored at the position
   * immediately before it.
   *
   * @return a parallel stream with one sentence pair per boundary marker
   */
  public Stream<SentencePair> parallelStream() {
    return IntStream.range(0, srcBitext.length)
        .parallel()
        .filter(pos -> srcBitext[pos] < 0)
        .mapToObj(boundary -> new SentencePair(boundary - 1));
  }
  
  /**
   * Streaming loader, which does not double peak memory like the loader
   * that creates a suffix array from a parallel corpus.
   * <p>
   * The three files are read line by line in lockstep, twice: the first pass
   * counts sentences and token positions so that the flat arrays can be
   * allocated at their exact size, and the second pass fills them. All three
   * readers of each pass are closed when the pass ends, including when an
   * exception is thrown.
   *
   * @param source path of the source-side file
   * @param target path of the target-side file
   * @param align path of the alignment file
   * @throws IOException if one of the files cannot be opened or read
   * @throws RuntimeException if the flattened source or target side does not fit in an int
   */
  private void loadCorpus(String source, String target, String align) throws IOException {
    logger.info("Counting the number of corpus positions");
    TimeKeeper timer = TimingUtils.start();
    // Read in the files once to count the sentences and corpus positions
    int numSourcePositions = 0;
    int numTargetPositions = 0;
    numSentences = 0;
    ParallelCorpus corpus = new ParallelCorpus(1);
    // All three readers are declared in the resource header so that each of
    // them is closed, not just the source reader.
    try (LineNumberReader fReader = IOTools.getReaderFromFile(source);
        LineNumberReader eReader = IOTools.getReaderFromFile(target);
        LineNumberReader aReader = IOTools.getReaderFromFile(align)) {
      for (String fLine; (fLine = fReader.readLine()) != null; ) {
        // NOTE(review): eLine/aLine are null when the target or alignment file
        // is shorter than the source file; they are handed to getSentence()
        // unchanged -- confirm that getSentence() tolerates null input.
        String eLine = eReader.readLine();
        String aLine = aReader.readLine();
        AlignedSentence example = corpus.getSentence(fLine, eLine, aLine);
        if (example != null) {
          numSourcePositions += example.sourceLength();
          numTargetPositions += example.targetLength();
          ++numSentences;
        }
      }
    }
    final int initialVocabularySize = corpus.getVocabulary().size();
    timer.mark("Counting corpus positions");
    logger.info("Source positions: {}  Target positions: {}  Sentences: {}", numSourcePositions, 
        numTargetPositions, numSentences);
    
    // Create the arrays. Each side holds its tokens plus one boundary marker
    // per sentence; a negative sum signals int overflow.
    final int srcLength = numSourcePositions + numSentences;
    if (srcLength < 0) throw new RuntimeException("Maximum source bitext size exceeded");
    srcBitext = new int[srcLength];
    f2e = new int[srcLength];
    final int tgtLength = numTargetPositions + numSentences;
    if (tgtLength < 0) throw new RuntimeException("Maximum target bitext size exceeded");
    tgtBitext = new int[tgtLength];
    e2f = new int[tgtLength];
    
    // Read the files again and fill the arrays
    try (LineNumberReader fReader = IOTools.getReaderFromFile(source);
        LineNumberReader eReader = IOTools.getReaderFromFile(target);
        LineNumberReader aReader = IOTools.getReaderFromFile(align)) {
      int srcOffset = 0;
      int tgtOffset = 0;
      for (String fLine; (fLine = fReader.readLine()) != null; ) {
        String eLine = eReader.readLine();
        String aLine = aReader.readLine();
        AlignedSentence sentence = corpus.getSentence(fLine, eLine, aLine);
        if (sentence == null) {
          logger.info("Discarding parallel example {}", fReader.getLineNumber());
        } else {
          System.arraycopy(sentence.source, 0, srcBitext, srcOffset, sentence.sourceLength());
          System.arraycopy(sentence.f2e, 0, f2e, srcOffset, sentence.f2e.length);
          System.arraycopy(sentence.target, 0, tgtBitext, tgtOffset, sentence.targetLength());
          System.arraycopy(sentence.e2f, 0, e2f, tgtOffset, sentence.e2f.length);
          srcOffset += sentence.sourceLength();
          tgtOffset += sentence.targetLength();
          // Source boundary marker points to the target boundary marker
          srcBitext[srcOffset] = toSentenceOffset(tgtOffset);
          // Target boundary marker points to the source boundary marker
          tgtBitext[tgtOffset] = toSentenceOffset(srcOffset);
          ++srcOffset;
          ++tgtOffset;
        }        
      }
    }
    this.vocabulary = corpus.getVocabulary();
    // The second pass re-reads the same text, so it must not add vocabulary entries.
    assert initialVocabularySize == vocabulary.size();
    timer.mark("Loading corpus");
    logger.info("Done loading corpus: {}", timer);
  }
  
  /**
   * Flatten an in-memory parallel corpus into contiguous arrays.
   * Set the corpus reference to null after this call to free memory.
   * <p>
   * Each sentence's tokens and alignments are copied one after another; each
   * sentence is followed by one boundary slot holding a negative, encoded
   * pointer to the matching boundary slot on the other side.
   *
   * @param corpus the corpus to flatten
   */
  private void loadCorpus(ParallelCorpus corpus) {
    logger.info("Flattening parallel corpus");
    final TimeKeeper timer = TimingUtils.start();

    numSentences = corpus.size();
    final int numSourcePositions = corpus.numSourcePositions();
    final int numTargetPositions = corpus.numTargetPositions();

    // One extra slot per sentence for the boundary marker.
    final int srcLength = numSourcePositions + numSentences;
    srcBitext = new int[srcLength];
    f2e = new int[srcLength];
    final int tgtLength = numTargetPositions + numSentences;
    tgtBitext = new int[tgtLength];
    e2f = new int[tgtLength];

    int srcCursor = 0;
    int tgtCursor = 0;
    for (AlignedSentence sentence : corpus) {
      final int srcLen = sentence.sourceLength();
      final int tgtLen = sentence.targetLength();

      // Source side: tokens and source-to-target alignments
      System.arraycopy(sentence.source, 0, srcBitext, srcCursor, srcLen);
      System.arraycopy(sentence.f2e, 0, f2e, srcCursor, sentence.f2e.length);
      // Target side: tokens and target-to-source alignments
      System.arraycopy(sentence.target, 0, tgtBitext, tgtCursor, tgtLen);
      System.arraycopy(sentence.e2f, 0, e2f, tgtCursor, sentence.e2f.length);

      srcCursor += srcLen;
      tgtCursor += tgtLen;

      // The two boundary markers point at each other.
      srcBitext[srcCursor] = toSentenceOffset(tgtCursor);
      tgtBitext[tgtCursor] = toSentenceOffset(srcCursor);
      srcCursor++;
      tgtCursor++;
    }

    vocabulary = corpus.getVocabulary();
    timer.mark("Corpus loading");
    logger.info("Done loading corpus: {}", timer);
  }

  /**
   * Encoding of bitext pointers: maps a non-negative corpus position to a
   * strictly negative value (0 becomes -1, 1 becomes -2, ...), so that it can
   * be stored in a bitext array and told apart from token ids.
   *
   * @param corpusPosition the position to encode
   * @return the encoded pointer
   */
  private static int toSentenceOffset(int corpusPosition) {
    return -corpusPosition - 1;
  }
  
  /**
   * Decoding of bitext pointers: the inverse of the pointer encoding
   * (-1 becomes 0, -2 becomes 1, ...).
   *
   * @param offset the encoded pointer
   * @return the decoded corpus position
   */
  private static int fromSentenceOffset(int offset) {
    return -(offset + 1);
  }
  
  /**
   * Create the source and target suffix arrays for the loaded bitext.
   * <p>
   * Each suffix array is expected to contain one entry per token position,
   * i.e. the bitext length minus the number of sentence-boundary markers.
   *
   * @throws RuntimeException if a suffix array does not have the expected size
   */
  public void build() {
    logger.info("Building suffix arrays...");
    final TimeKeeper timer = TimingUtils.start();

    final int expectedSrcSize = srcBitext.length - numSentences;
    srcSuffixArray = build(srcBitext, expectedSrcSize);
    if (srcSuffixArray.length != expectedSrcSize) {
      throw new RuntimeException();
    }
    timer.mark("Source array");

    final int expectedTgtSize = tgtBitext.length - numSentences;
    tgtSuffixArray = build(tgtBitext, expectedTgtSize);
    if (tgtSuffixArray.length != expectedTgtSize) {
      throw new RuntimeException();
    }
    timer.mark("Target array");

    logger.info("Done constructing suffix arrays: {}", timer);
  }
  
  /**
   * Sort the positions of one side of the bitext in parallel.
   * <p>
   * All positions are sorted by the suffix that starts there. Positions that
   * hold a sentence-boundary marker sort after every token position, so
   * keeping only the first {@code numPositions} entries drops them.
   *
   * @param bitext the flattened bitext (token ids and negative boundary markers)
   * @param numPositions the number of leading entries of the sorted order to keep
   * @return the first {@code numPositions} positions in sorted suffix order
   */
  private int[] build(final int[] bitext, int numPositions) {
    return IntStream.range(0, bitext.length)
        .parallel()
        .boxed()
        .sorted((left, right) -> compareSuffixes(bitext, left, right))
        .limit(numPositions)
        .mapToInt(Integer::intValue)
        .toArray();
  }

  /**
   * Order two bitext positions by the suffixes that start at them.
   *
   * @param bitext the flattened bitext
   * @param left start position of the first suffix
   * @param right start position of the second suffix
   * @return a negative, zero, or positive value if the first suffix sorts
   *     before, equal to, or after the second one
   */
  private int compareSuffixes(final int[] bitext, final int left, final int right) {
    int leftId = bitext[left];
    int rightId = bitext[right];

    // Sentence boundaries are treated as longer than everything else, so they
    // are pushed to the end of the sorted order where limit() can cut them off.
    final boolean leftIsBoundary = leftId < 0;
    final boolean rightIsBoundary = rightId < 0;
    if (leftIsBoundary || rightIsBoundary) {
      if (leftIsBoundary && rightIsBoundary) {
        return 0;
      }
      return leftIsBoundary ? 1 : -1;
    }

    // Walk both suffixes in lockstep until they differ or one of them ends.
    int leftPos = left;
    int rightPos = right;
    while (leftId >= 0 && rightId >= 0) {
      if (leftId != rightId) {
        // Lexicographic order of the vocabulary entries
        return vocabulary.get(leftId).compareTo(vocabulary.get(rightId));
      }
      leftId = bitext[++leftPos];
      rightId = bitext[++rightPos];
    }

    // Shared prefix: the suffix that ended first is the smaller one.
    final int leftLength = leftPos - left + (leftId < 0 ? 0 : 1);
    final int rightLength = rightPos - right + (rightId < 0 ? 0 : 1);
    return leftLength - rightLength;
  }

  /**
   * Print the suffix array, one line per entry in the form
   * {@code rank: token token ...}, where the tokens run from the suffix
   * start up to the next sentence boundary. The writer is flushed at the end.
   *
   * @param isSource true to print the source side, false for the target side
   * @param out the writer to print to
   */
  public void print(boolean isSource, PrintWriter out) {
    final int[] sa;
    final int[] bitext;
    if (isSource) {
      sa = this.srcSuffixArray;
      bitext = this.srcBitext;
    } else {
      sa = this.tgtSuffixArray;
      bitext = this.tgtBitext;
    }

    for (int rank = 0; rank < sa.length; ++rank) {
      final int start = sa[rank];
      final StringBuilder line = new StringBuilder();
      line.append(rank).append(": ");
      int corpusPos = start;
      while (bitext[corpusPos] >= 0) {
        if (corpusPos > start) {
          line.append(" ");
        }
        line.append(vocabulary.get(bitext[corpusPos]));
        ++corpusPos;
      }
      out.println(line.toString());
    }
    out.flush();
  }
  
  /**
   * Find all frequent source n-grams of order 1 to 3 and pre-compute a
   * sample for each of them.
   * <p>
   * The source suffix array is scanned once; consecutive suffixes that share
   * the same leading n-gram form a run, and a run is cached when its count is
   * strictly greater than {@code minOccurrences}. The last run of each order
   * is evaluated too once the scan has finished.
   * <p>
   * As a side effect this method (re)builds the target unigram bound caches
   * that {@link #count(int[], boolean)} uses, unless the source suffix array
   * is empty (early return) or the target suffix array is empty.
   * 
   * TODO(spenceg) Lopez reports finding a few order=5 n-grams of high frequency
   * so maybe generalize this lookup.
   * 
   * @param sampleSize the maximum number of sentences to sample per n-gram
   * @param minOccurrences the count a run must exceed in order to be cached
   * @return a map from frequent n-gram to its pre-computed sample; empty if
   *     the source suffix array is empty
   * @throws IllegalArgumentException if {@code sampleSize >= minOccurrences}
   */
  public Map<Span,SuffixArraySample> lookupFrequentSourceNgrams(int sampleSize, int minOccurrences) {
    if (sampleSize >= minOccurrences) {
      throw new IllegalArgumentException(String.format(
          "sampleSize (%d) must be smaller than minOccurrences (%d)", sampleSize, minOccurrences));
    }
    if (srcSuffixArray.length == 0) return Collections.emptyMap();
    logger.info("Building query cache with threshold {}", minOccurrences);
    Map<Span,SuffixArraySample> queryCache = new HashMap<>(1000);
    // Per order: the span of the current run, where it starts, and its length so far.
    int nCnt = 1, nnCnt = 1, nnnCnt = 1;
    int nStart = 0, nnStart = 0, nnnStart = 0;
    Suffix firstSuffix = new Suffix(srcSuffixArray[0], true);
    Span nSpan = new Span(firstSuffix, 1), 
        nnSpan = new Span(firstSuffix, 2), 
        nnnSpan = new Span(firstSuffix, 3);
    for (int i = 1; i < srcSuffixArray.length; ++i) {
      Suffix suffix = new Suffix(srcSuffixArray[i], true);
      Span nSpanThis = new Span(suffix, 1);
      Span nnSpanThis = new Span(suffix, 2);
      Span nnnSpanThis = new Span(suffix, 3);
      // checkSpan returns 1 exactly when a new run starts at position i.
      nCnt = checkSpan(nSpan, nSpanThis, nStart, i, nCnt, minOccurrences, sampleSize, queryCache);
      if (nCnt == 1) {
        nStart = i;
        nSpan = nSpanThis;
      }
      nnCnt = checkSpan(nnSpan, nnSpanThis, nnStart, i, nnCnt, minOccurrences, sampleSize, queryCache);
      if (nnCnt == 1) {
        nnStart = i;
        nnSpan = nnSpanThis;
      }
      nnnCnt = checkSpan(nnnSpan, nnnSpanThis, nnnStart, i, nnnCnt, minOccurrences, sampleSize, queryCache);
      if (nnnCnt == 1) {
        nnnStart = i;
        nnnSpan = nnnSpanThis;
      }
    }
    // The loop only evaluates a run when the following run begins, so the last
    // run of each order is still pending. A null "next" span never equals the
    // current one, which makes checkSpan evaluate the pending run.
    final int saEnd = srcSuffixArray.length;
    checkSpan(nSpan, null, nStart, saEnd, nCnt, minOccurrences, sampleSize, queryCache);
    checkSpan(nnSpan, null, nnStart, saEnd, nnCnt, minOccurrences, sampleSize, queryCache);
    checkSpan(nnnSpan, null, nnnStart, saEnd, nnnCnt, minOccurrences, sampleSize, queryCache);
    logger.info("Query cache size: {}", queryCache.size());
    
    buildTargetCountCaches();
    
    return queryCache;
  }

  /**
   * Record, for every vocabulary id, the first and last (both inclusive)
   * index in the target suffix array of a suffix starting with that id.
   * Ids that never start a target suffix keep the value -1 in both caches.
   * Nothing is cached when the target suffix array is empty.
   */
  private void buildTargetCountCaches() {
    if (tgtSuffixArray.length == 0) {
      // No unigram has bounds; leave the caches unset so that count() uses
      // the plain binary search.
      return;
    }
    logger.info("Creating target unigram caches for the count() function...");
    final int[] lbCache = new int[vocabulary.size()];
    Arrays.fill(lbCache, -1);
    final int[] ubCache = new int[vocabulary.size()];
    Arrays.fill(ubCache, -1);
    int lastId = tgtBitext[tgtSuffixArray[0]];
    
    for (int i = 0; i < tgtSuffixArray.length; ++i) {
      int tgtId = tgtBitext[tgtSuffixArray[i]];
      assert tgtId >= 0;
      if (lbCache[tgtId] < 0) {
        lbCache[tgtId] = i;
      }
      if (lastId != tgtId) {
        // The run of lastId ended at the previous index (inclusive).
        ubCache[lastId] = i-1;
        assert ubCache[lastId] >= lbCache[lastId] : String.format("%d %d %d", i, lastId, tgtId);
      }
      lastId = tgtId;
    }
    
    // Final update: the last run ends at the last valid index. The bound is
    // inclusive, like the bounds written inside the loop.
    ubCache[lastId] = tgtSuffixArray.length - 1;
    assert ubCache[lastId] >= lbCache[lastId] : String.format("%d %d final", tgtSuffixArray.length, lastId);
    
    this.tgtCountLBCache = lbCache;
    this.tgtCountUBCache = ubCache;
    logger.info("Finished building count() cache.");
  }
    
  /**
   * Advance the run-length bookkeeping for one n-gram order.
   * <p>
   * If {@code nextSpan} continues the current run, the incremented count is
   * returned. Otherwise the run {@code [startSa, endSa)} has ended: when its
   * count is strictly greater than the threshold, a sample of at most
   * {@code sampleSize} evenly spaced hits is stored in the cache under
   * {@code currentSpan}; in either case 1 is returned, which is the count of
   * the run that starts with {@code nextSpan}.
   *
   * @param currentSpan the n-gram of the current run
   * @param nextSpan the n-gram found at suffix-array index {@code endSa}
   * @param startSa first suffix-array index of the current run (inclusive)
   * @param endSa suffix-array index just past the current run (exclusive)
   * @param cnt length of the current run so far
   * @param ruleCacheThreshold count a run must exceed in order to be cached
   * @param sampleSize maximum number of hits to store
   * @param queryCache the cache to add to
   * @return the updated run length
   */
  private int checkSpan(Span currentSpan, Span nextSpan, int startSa, int endSa, int cnt, 
      int ruleCacheThreshold, int sampleSize, Map<Span, SuffixArraySample> queryCache) {
    final boolean sameRun = currentSpan != null && currentSpan.equals(nextSpan);
    if (sameRun) {
      return cnt + 1;
    }

    if (cnt > ruleCacheThreshold) {
      final int numHits = endSa - startSa;
      final int stepSize;
      if (numHits < sampleSize) {
        stepSize = 1;
      } else {
        stepSize = numHits / sampleSize;
      }
      assert stepSize > 0;

      // Evenly spaced walk through the run
      final List<SentencePair> hits = new ArrayList<>(sampleSize);
      int saIndex = startSa;
      while (saIndex < endSa && hits.size() < sampleSize) {
        final int corpusPosition = srcSuffixArray[saIndex];
        assert srcBitext[corpusPosition] >= 0;
        hits.add(new SentencePair(corpusPosition));
        saIndex += stepSize;
      }
      // The stored upper bound is inclusive.
      queryCache.put(currentSpan, new SuffixArraySample(hits, startSa, endSa-1));
    }
    return 1;
  }

  /**
   * Identifies a span (an n-gram of token ids) for caching. Equality and
   * hashing are based on the token ids only.
   * 
   * @author Spence Green
   *
   */
  public class Span {
    public final int[] tokens;
    private final int hashCode;

    /**
     * Take the first {@code order} tokens of a suffix. If the suffix ends
     * before {@code order} tokens are available, the span is empty.
     *
     * @param suffix the suffix to read from
     * @param order the n-gram order
     */
    private Span(Suffix suffix, int order) {
      final int[] ids = new int[order];
      boolean complete = true;
      for (int i = 0; complete && i < order; ++i) {
        final int id = suffix.get(i);
        if (id < 0) {
          complete = false;
        } else {
          ids[i] = id;
        }
      }
      this.tokens = complete ? ids : new int[0];
      this.hashCode = MurmurHash2.hash32(this.tokens, this.tokens.length, 1);
    }

    @Override
    public int hashCode() {
      return this.hashCode;
    }

    @Override
    public boolean equals(Object o) {
      if (o == this) {
        return true;
      }
      if (o instanceof Span) {
        return Arrays.equals(this.tokens, ((Span) o).tokens);
      }
      return false;
    }

    /**
     * @return the vocabulary entries of the tokens, separated by single spaces
     */
    @Override
    public String toString() {
      final StringBuilder sb = new StringBuilder();
      for (int i = 0; i < tokens.length; ++i) {
        if (i > 0) {
          sb.append(" ");
        }
        sb.append(vocabulary.get(tokens[i]));
      }
      return sb.toString();
    }
  }

  /**
   * The number of segments in the underlying corpus.
   * 
   * @return the sentence count recorded when the corpus was loaded or deserialized
   */
  public int numSentences() {
    return this.numSentences;
  }

  /**
   * @return the number of entries in the source suffix array
   */
  public int sourceSASize() {
    return this.srcSuffixArray.length;
  }
  
  /**
   * @return the number of entries in the target suffix array
   */
  public int targetSASize() {
    return this.tgtSuffixArray.length;
  }
  
  /**
   * Find a lower or upper bound in the suffix array, searching from
   * {@code startFrom} to the last entry of the array.
   * 
   * @param query the token ids to look up
   * @param isSource true to search the source side, false for the target side
   * @param lowerBound true to find the first match, false to find the last match
   * @param startFrom the first suffix-array index to consider
   * @return the suffix-array index of the bound, or -1 if the query was not found
   */
  private int findBound(final int[] query, boolean isSource, boolean lowerBound, int startFrom) {
    final int lastIndex = (isSource ? this.srcSuffixArray : this.tgtSuffixArray).length - 1;
    return findBound(query, isSource, lowerBound, startFrom, lastIndex);
  }
  
  /**
   * Binary search for the first or last suffix-array entry whose suffix
   * starts with the query.
   * <p>
   * The comparison result is the query relative to the suffix: a negative
   * value means the query sorts before the suffix (continue in the lower
   * half), a positive value means it sorts after (continue in the upper
   * half), and zero means the query is a prefix of the suffix.
   *
   * @param query the token ids to look up
   * @param isSource true to search the source side, false for the target side
   * @param lowerBound true to find the first match, false to find the last match
   * @param lo the first suffix-array index to consider (inclusive)
   * @param hi the last suffix-array index to consider (inclusive)
   * @return the suffix-array index of the bound, or -1 if the query was not found
   */
  private int findBound(final int[] query, boolean isSource, boolean lowerBound, int lo, int hi) {
    final int[] sa = isSource ? this.srcSuffixArray : this.tgtSuffixArray;
    int left = lo;
    int right = hi;
    while (left <= right) {
      final int mid = (left + right) >>> 1;
      assert mid < sa.length;
      final int corpusPos = sa[mid];
      assert corpusPos >= 0;
      final int cmp = new Suffix(corpusPos, isSource).compare(query);

      if (cmp < 0) {
        // Query sorts before the middle suffix: continue in the lower half
        right = mid - 1;
        continue;
      }
      if (cmp > 0) {
        // Query sorts after the middle suffix: continue in the upper half
        left = mid + 1;
        continue;
      }

      // The middle suffix matches. It is the bound if its neighbor on the
      // relevant side does not match; otherwise keep searching on that side.
      if (lowerBound) {
        if (mid == 0) {
          return 0;
        }
        final int neighborCmp = new Suffix(sa[mid-1], isSource).compare(query);
        if (neighborCmp > 0) {
          return mid;
        }
        assert neighborCmp == 0;
        right = mid - 1;
      } else {
        if (mid == sa.length - 1) {
          return mid;
        }
        final int neighborCmp = new Suffix(sa[mid+1], isSource).compare(query);
        if (neighborCmp < 0) {
          return mid;
        }
        assert neighborCmp == 0;
        left = mid + 1;
      }
    }
    // Key not found
    return -1;
  }

  /**
   * Wrapper object for suffix queries: a start position in the source or
   * target bitext, read up to the next sentence boundary.
   * 
   * @author Spence Green
   *
   */
  private class Suffix {
    private final int pos;
    private final boolean isSource;

    /**
     * @param corpusPosition the bitext position at which the suffix starts
     * @param isSource true for the source bitext, false for the target bitext
     */
    public Suffix(int corpusPosition, boolean isSource) {
      this.isSource = isSource;
      this.pos = corpusPosition;
    }
    
    /**
     * @param i offset from the start of the suffix
     * @return the token id at that offset, or -1 if the offset lies outside
     *     the bitext or hits a sentence-boundary marker
     */
    public int get(int i) {
      final int[] bitext = isSource ? srcBitext : tgtBitext;
      final int bitextPos = this.pos + i;
      final boolean inRange = bitextPos >= 0 && bitextPos < bitext.length;
      return (inRange && bitext[bitextPos] >= 0) ? bitext[bitextPos] : -1;
    }

    /**
     * Compare a query against this suffix, token by token.
     *
     * @param query the token ids of the query
     * @return 0 if the non-empty query is a prefix of this suffix; the
     *     lexicographic comparison of the first differing vocabulary entries
     *     (query relative to suffix) if they differ; 1 if the suffix ends
     *     before the query does or the query is empty
     */
    public int compare(int[] query) {
      final int[] bitext = isSource ? srcBitext : tgtBitext;
      int matched = 0;
      int bitextPos = pos;
      while (matched < query.length && bitext[bitextPos] >= 0) {
        final int queryId = query[matched];
        final int suffixId = bitext[bitextPos];
        if (queryId != suffixId) {
          return vocabulary.get(queryId).compareTo(vocabulary.get(suffixId));
        }
        ++matched;
        ++bitextPos;
      }

      // The query counts as consumed only if at least one token was compared
      // and every query token matched; otherwise the query is longer than
      // the suffix (or empty).
      final boolean consumedQuery = query.length > 0 && matched == query.length;
      return consumedQuery ? 0 : 1;
    }

    /**
     * @return the vocabulary entries of the suffix up to the sentence
     *     boundary, separated by single spaces
     */
    @Override
    public String toString() {
      final StringBuilder sb = new StringBuilder();
      int i = 0;
      for (int vocabId = get(0); vocabId >= 0; vocabId = get(++i)) {
        if (i > 0) {
          sb.append(" ");
        }
        sb.append(vocabulary.get(vocabId));
      }
      return sb.toString();
    }
  }

  /**
   * Count of this sequence in either the source or target bitext.
   * <p>
   * For target queries the unigram bound caches are used when they have been
   * built: a unigram is answered from the caches directly, and a longer query
   * is searched only within the bounds of its first token. A first token
   * that has no cached bounds (it starts no target suffix, or its id lies
   * outside the caches) yields 0.
   * 
   * @param query the token ids to count
   * @param isSource true to count in the source bitext, false for the target bitext
   * @return the number of occurrences; 0 if the query is empty or not found
   */
  public int count(final int[] query, boolean isSource) {
    if (query.length == 0) return 0;
    final int[] lbCache = this.tgtCountLBCache;
    final int[] ubCache = this.tgtCountUBCache;
    if (!isSource && lbCache != null && ubCache != null) {
      // Use caches for fast target lookup
      final int tgtId = query[0];
      if (tgtId < 0 || tgtId >= lbCache.length || tgtId >= ubCache.length) {
        // The caches have no slot for this id, so they hold no bounds for it.
        return 0;
      }
      final int lo = lbCache[tgtId];
      if (lo < 0) {
        // -1 marks an id that starts no target suffix; without this check
        // the unigram arithmetic below would yield (-1) - (-1) + 1 == 1.
        return 0;
      }
      // Bounds are inclusive; never let the upper bound run past the last index.
      final int hi = Math.min(ubCache[tgtId], tgtSuffixArray.length - 1);
      if (query.length == 1) {
        int count = hi - lo + 1;
        assert count > 0 : String.format("%d %d %d %d", tgtId, count, lo, hi);
        return count;
      }
      // Every match starts with tgtId, so it lies inside [lo, hi].
      int lb = findBound(query, isSource, true, lo, hi);
      if (lb < 0) return 0;
      int ub = findBound(query, isSource, false, lb, hi);
      assert ub >= 0 : String.format("%d %d %d %d %d", tgtId, lo, hi, lb, ub);
      return ub - lb + 1;
    }

    // Standard case: search the whole suffix array
    int lb = findBound(query, isSource, true, 0);
    if (lb < 0) return 0;
    int ub = findBound(query, isSource, false, lb);
    assert ub >= 0;
    return ub - lb + 1;
  }
  
  /**
   * Return a sample of sentences from this suffix array, searching the
   * whole source suffix array and without the exact-match filter.
   * 
   * @param sourceQuery the source token ids to look up
   * @param maxSamples the maximum number of sentences to return
   * @return the sample together with the bounds of the matching range
   */
  public SuffixArraySample sample(final int[] sourceQuery, int maxSamples) {
    final int minBound = 0;
    final int maxBound = -1;
    return sample(sourceQuery, maxSamples, minBound, maxBound);
  }
  
  /**
   * Return a sample of sentences from this suffix array, searching the
   * whole source suffix array.
   * 
   * @param sourceQuery the source token ids to look up
   * @param maxSamples the maximum number of sentences to return
   * @param exactMatch if true, keep only sentences whose source length
   *     equals the query length
   * @return the sample together with the bounds of the matching range
   */
  public SuffixArraySample sample(final int[] sourceQuery, int maxSamples, boolean exactMatch) {
    final int minBound = 0;
    final int maxBound = -1;
    return sample(sourceQuery, maxSamples, minBound, maxBound, exactMatch);
  }


  /**
   * Return a sample of sentences from the suffix array, restricted to the
   * given search bounds and without the exact-match filter.
   * 
   * @param sourceQuery the source token ids to look up
   * @param maxSamples the maximum number of sentences to return
   * @param minBound the first suffix-array index to search
   * @param maxBound the last suffix-array index to search; ignored unless
   *     it is greater than {@code minBound}
   * @return the sample together with the bounds of the matching range
   */
  public SuffixArraySample sample(final int[] sourceQuery, int maxSamples, int minBound, int maxBound) {
    final boolean exactMatch = false;
    return sample(sourceQuery, maxSamples, minBound, maxBound, exactMatch);
  }
  
  /**
   * Return a sample of sentences from the suffix array.
   * <p>
   * The range of suffix-array entries matching the query is located first;
   * that range is then walked with an even step so that at most
   * {@code maxSamples} sentences are collected.
   * 
   * @param sourceQuery the source token ids to look up
   * @param maxSamples the maximum number of sentences to return
   * @param minBound the first suffix-array index to search
   * @param maxBound the last suffix-array index to search; ignored unless
   *     it is greater than {@code minBound} (for the lower bound) or than the
   *     lower bound found (for the upper bound)
   * @param exactMatch if true, keep only sentences whose source length
   *     equals the query length
   * @return the sample with the inclusive bounds of the matching range, or an
   *     empty sample with bounds -1/-1 if the query is empty or not found
   */
  public SuffixArraySample sample(final int[] sourceQuery, int maxSamples, int minBound, int maxBound, boolean exactMatch) {
    if (sourceQuery.length == 0) {
      return new SuffixArraySample(Collections.emptyList(), -1, -1);
    }

    final int lb;
    if (maxBound > minBound) {
      lb = findBound(sourceQuery, true, true, minBound, maxBound);
    } else {
      lb = findBound(sourceQuery, true, true, minBound);
    }
    if (lb < 0) {
      return new SuffixArraySample(Collections.emptyList(), -1, -1);
    }

    final int ub;
    if (maxBound > lb) {
      ub = findBound(sourceQuery, true, false, lb, maxBound);
    } else {
      ub = findBound(sourceQuery, true, false, lb);
    }
    assert ub >= 0;

    final int numHits = ub - lb + 1;
    final int stepSize;
    if (numHits < maxSamples) {
      stepSize = 1;
    } else {
      stepSize = numHits / maxSamples;
    }
    assert stepSize > 0;

    // Stratified sample through the list of positions
    final List<SentencePair> samples = new ArrayList<>(maxSamples);
    int saIndex = lb;
    while (saIndex <= ub && samples.size() < maxSamples) {
      final SentencePair candidate = new SentencePair(srcSuffixArray[saIndex]);
      if (!exactMatch || candidate.sourceLength() == sourceQuery.length) {
        samples.add(candidate);
      }
      saIndex += stepSize;
    }
    return new SuffixArraySample(samples, lb, ub);
  }
  
  /**
   * Return a sample from the target-side. Optimizations for pre-initializing the search
   * are not supported, so the whole target suffix array is always searched.
   * 
   * @param targetQuery the target token ids to look up
   * @param maxSamples the maximum number of sentences to return
   * @return the sample with the inclusive bounds of the matching range, or an
   *     empty sample with bounds -1/-1 if the query is empty or not found
   */
  public SuffixArraySample sampleTarget(final int[] targetQuery, int maxSamples) {
    if (targetQuery.length == 0) {
      return new SuffixArraySample(Collections.emptyList(), -1, -1);
    }

    final int lb = findBound(targetQuery, false, true, 0);
    if (lb < 0) {
      return new SuffixArraySample(Collections.emptyList(), -1, -1);
    }
    final int ub = findBound(targetQuery, false, false, lb);
    assert ub >= 0;

    final int numHits = ub - lb + 1;
    final int stepSize;
    if (numHits < maxSamples) {
      stepSize = 1;
    } else {
      stepSize = numHits / maxSamples;
    }
    assert stepSize > 0;

    // Stratified sample through the list of positions
    final List<SentencePair> samples = new ArrayList<>(maxSamples);
    int saIndex = lb;
    while (saIndex <= ub && samples.size() < maxSamples) {
      samples.add(new SentencePair(tgtSuffixArray[saIndex], true));
      saIndex += stepSize;
    }
    return new SuffixArraySample(samples, lb, ub);
  }

  /**
   * A sampled sentence with an associated pointer to the left edge of
   * the query sequence.
   * <p>
   * The pair is a view onto the enclosing suffix array's flat bitext arrays:
   * it stores only the start and end positions of the source and target
   * sentences. All index arguments of the accessors are relative to the
   * start of the respective sentence.
   * 
   * @author Spence Green
   *
   */
  public class SentencePair {
    
    /**
     * Offset of the position this pair was created from, relative to the
     * start of its sentence (source sentence for the one-argument
     * constructor, target sentence for the two-argument constructor).
     */
    public final int wordPosition;
    
    // TODO(spenceg) The character offset would yield a sentence id for e.g., bitext tuning.
//    public final int sentenceId;
    
    public final int srcStartInclusive;
    private final int srcEndExclusive;
    private final int tgtStartInclusive;
    private final int tgtEndExclusive;
    
    /**
     * Create a pair from a token position in the source bitext.
     *
     * @param corpusPosition a source bitext position that holds a token
     */
    private SentencePair(int corpusPosition) {
      // Find source span
      int j = corpusPosition;
      assert srcBitext[j] >= 0;
      // Walk forward to this sentence's boundary marker
      while (srcBitext[j] >= 0) j++;
      srcEndExclusive = j;
      // Walk backward to the previous boundary marker (or before the array start)
      j = corpusPosition - 1;
      while (j >= 0 && srcBitext[j] >= 0) j--;
      srcStartInclusive = j + 1;
      assert corpusPosition >= srcStartInclusive : String.format("%d %d", corpusPosition, srcStartInclusive);
      
      // Find the target span. The previous source marker points to the
      // previous target marker, so the target sentence starts right after it;
      // this sentence's source marker points to its target marker.
      tgtStartInclusive = j == -1 ? 0 : fromSentenceOffset(srcBitext[j]) + 1;
      tgtEndExclusive = fromSentenceOffset(srcBitext[srcEndExclusive]);
      assert tgtStartInclusive < tgtEndExclusive : String.format("tgt: %d %d", tgtStartInclusive, 
          tgtEndExclusive);
      assert tgtEndExclusive > 0 : String.valueOf(tgtEndExclusive);
      assert fromSentenceOffset(tgtBitext[tgtEndExclusive]) == srcEndExclusive : String.format("%d %d", 
          fromSentenceOffset(tgtBitext[tgtEndExclusive]), srcEndExclusive);
      
      // Set the start of the query
      wordPosition = corpusPosition - srcStartInclusive;
    }
    
    /**
     * Create a pair from a token position in the target bitext.
     * <p>
     * This always creates a sentence pair from a target position; the extra
     * parameter exists only so that the two constructors have different
     * signatures, and its value is not read.
     *
     * @param corpusPosition a target bitext position that holds a token
     * @param isTarget ignored
     */
    private SentencePair(int corpusPosition, boolean isTarget) {
      // Find target span
      int j = corpusPosition;
      assert tgtBitext[j] >= 0;
      // Walk forward to this sentence's boundary marker
      while (tgtBitext[j] >= 0) j++;
      tgtEndExclusive = j;
      // Walk backward to the previous boundary marker (or before the array start)
      j = corpusPosition - 1;
      while (j >= 0 && tgtBitext[j] >= 0) j--;
      tgtStartInclusive = j + 1;
      assert corpusPosition >= tgtStartInclusive : String.format("%d %d", corpusPosition, tgtStartInclusive);
      
      // Find the source span via the pointers stored in the target markers
      srcStartInclusive = j == -1 ? 0 : fromSentenceOffset(tgtBitext[j]) + 1;
      srcEndExclusive = fromSentenceOffset(tgtBitext[tgtEndExclusive]);
      assert srcStartInclusive < srcEndExclusive : String.format("src: %d %d", srcStartInclusive, 
          srcEndExclusive);
      assert srcEndExclusive > 0 : String.valueOf(srcEndExclusive);
      assert fromSentenceOffset(srcBitext[srcEndExclusive]) == tgtEndExclusive : String.format("%d %d", 
          fromSentenceOffset(srcBitext[srcEndExclusive]), tgtEndExclusive);
      
      // Set the start of the query
      wordPosition = corpusPosition - tgtStartInclusive;
    }
    
    /**
     * @return the number of tokens in the source sentence
     */
    public int sourceLength() {
      return srcEndExclusive - srcStartInclusive;
    }
    
    /**
     * @return the number of tokens in the target sentence
     */
    public int targetLength() {
      return tgtEndExclusive - tgtStartInclusive;
    }
    
    /**
     * @param i index into the source sentence
     * @return the token id at that index
     * @throws ArrayIndexOutOfBoundsException if the index is outside the sentence
     */
    public int source(int i) {
      int bitextPos = srcStartInclusive + i;
      // Check against the sentence start, not 0: a negative index must not
      // read from a preceding sentence.
      if (bitextPos < srcStartInclusive || bitextPos >= srcEndExclusive) throw new ArrayIndexOutOfBoundsException();
      return srcBitext[bitextPos];
    }
    
    /**
     * @param i index into the target sentence
     * @return the token id at that index
     * @throws ArrayIndexOutOfBoundsException if the index is outside the sentence
     */
    public int target(int i) {
      int bitextPos = tgtStartInclusive + i;
      if (bitextPos < tgtStartInclusive || bitextPos >= tgtEndExclusive) throw new ArrayIndexOutOfBoundsException();
      return tgtBitext[bitextPos];
    }
    
    /**
     * @param startInclusive first source index of the range
     * @param endExclusive source index just past the range
     * @return a copy of the stored (still encoded) source-to-target alignment values for the range
     * @throws IllegalArgumentException if the range is empty or inverted
     * @throws ArrayIndexOutOfBoundsException if the range is not inside the sentence
     */
    public int[] f2e(int startInclusive, int endExclusive) {
      if (startInclusive >= endExclusive) throw new IllegalArgumentException();
      int bitextStartInclusive = srcStartInclusive + startInclusive;
      int bitextEndExclusive = srcStartInclusive + endExclusive;
      if (bitextStartInclusive < srcStartInclusive || bitextEndExclusive > srcEndExclusive) throw new ArrayIndexOutOfBoundsException();
      return Arrays.copyOfRange(f2e, bitextStartInclusive, bitextEndExclusive);
    }
    
    /**
     * @param i index into the source sentence
     * @return the stored source-to-target alignment value at that index, as expanded by
     *     {@code AlignedSentence.expand}
     * @throws ArrayIndexOutOfBoundsException if the index is outside the sentence
     */
    public int[] f2e(int i) {
      int bitextPos = srcStartInclusive + i;
      if (bitextPos < srcStartInclusive || bitextPos >= srcEndExclusive) throw new ArrayIndexOutOfBoundsException();
      return AlignedSentence.expand(f2e[bitextPos]);
    }
    
    /**
     * @param startInclusive first target index of the range
     * @param endExclusive target index just past the range
     * @return a copy of the stored (still encoded) target-to-source alignment values for the range
     * @throws IllegalArgumentException if the range is empty or inverted
     * @throws ArrayIndexOutOfBoundsException if the range is not inside the sentence
     */
    public int[] e2f(int startInclusive, int endExclusive) {
      if (startInclusive >= endExclusive) throw new IllegalArgumentException();
      int bitextStartInclusive = tgtStartInclusive + startInclusive;
      int bitextEndExclusive = tgtStartInclusive + endExclusive;
      if (bitextStartInclusive < tgtStartInclusive || bitextEndExclusive > tgtEndExclusive) throw new ArrayIndexOutOfBoundsException();
      return Arrays.copyOfRange(e2f, bitextStartInclusive, bitextEndExclusive);
    }
    
    /**
     * @param i index into the target sentence
     * @return the stored target-to-source alignment value at that index, as expanded by
     *     {@code AlignedSentence.expand}
     * @throws ArrayIndexOutOfBoundsException if the index is outside the sentence
     */
    public int[] e2f(int i) {
      int bitextPos = tgtStartInclusive + i;
      if (bitextPos < tgtStartInclusive || bitextPos >= tgtEndExclusive) throw new ArrayIndexOutOfBoundsException();
      return AlignedSentence.expand(e2f[bitextPos]);
    }
    
    /**
     * @param i index into the source sentence
     * @return true if the stored alignment value at that index is 0
     * @throws ArrayIndexOutOfBoundsException if the index is outside the sentence
     */
    public boolean isSourceUnaligned(int i) {
      int bitextPos = srcStartInclusive + i;
      if (bitextPos < srcStartInclusive || bitextPos >= srcEndExclusive) throw new ArrayIndexOutOfBoundsException();
      return f2e[bitextPos] == 0;
    }
    
    /**
     * @param i index into the target sentence
     * @return true if the stored alignment value at that index is 0
     * @throws ArrayIndexOutOfBoundsException if the index is outside the sentence
     */
    public boolean isTargetUnaligned(int i) {
      int bitextPos = tgtStartInclusive + i;
      if (bitextPos < tgtStartInclusive || bitextPos >= tgtEndExclusive) throw new ArrayIndexOutOfBoundsException();
      return e2f[bitextPos] == 0;
    }
    
    /**
     * @return a {@code ParallelSuffixArrayEntry} created from this pair and the vocabulary
     */
    public ParallelSuffixArrayEntry getParallelEntry() {
      return new ParallelSuffixArrayEntry(this, vocabulary);
    }
    
    @Override
    public String toString() {
      return this.getParallelEntry().toString();
    }
  }
  
  /**
   * A struct to hold the result of a sample of a suffix array: the sampled
   * sentence pairs plus the bounds of the suffix-array range they were
   * drawn from.
   * 
   * @author Spence Green
   *
   */
  public static class SuffixArraySample {
    public final List<SentencePair> samples;
    public final int lb;
    public final int ub;

    /**
     * @param q the sampled sentence pairs; stored as-is, not copied
     * @param lb the lower bound of the matching suffix-array range
     * @param ub the upper bound of the matching suffix-array range
     */
    public SuffixArraySample(List<SentencePair> q, int lb, int ub) {
      this.lb = lb;
      this.ub = ub;
      this.samples = q;
    }

    /**
     * @return the number of sampled sentence pairs
     */
    public int size() {
      return this.samples.size();
    }

    @Override
    public String toString() {
      final int sampleCount = samples.size();
      return String.format("bounds: %d/%d size: %d", lb, ub, sampleCount);
    }
  }
}