package morfologik.fsa;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.BitSet;
import java.util.Collections;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;

/**
 * Abstract base class for finite state automata. The automata handled here are
 * arc-based, following the design described in Jan Daciuk's <i>Incremental
 * Construction of Finite-State Automata and Transducers, and Their Use in the
 * Natural Language Processing</i> (PhD thesis, Technical University of Gdansk).
 *
 * <p>
 * Nodes and arcs are addressed by integer identifiers. Throughout this API the
 * identifier <code>0</code> is used as the "no such arc" marker when walking
 * the arcs of a node.
 * </p>
 */
public abstract class FSA implements Iterable<ByteBuffer> {
  /**
   * @return The identifier of this automaton's root node, or 0 when the start
   *         node is also the end node (that is, the automaton is empty).
   */
  public abstract int getRootNode();

  /**
   * @param node
   *          Identifier of the node.
   * @return The identifier of the first arc leaving <code>node</code>, or 0 if
   *         the node has no outgoing arcs.
   */
  public abstract int getFirstArc(int node);

  /**
   * @param arc
   *          The arc's identifier.
   * @return The identifier of the arc that follows <code>arc</code> among the
   *         arcs leaving the same node, or 0 if there are no more arcs for
   *         that node.
   */
  public abstract int getNextArc(int arc);

  /**
   * @param node
   *          Identifier of the node.
   * @param label
   *          The label to look for.
   * @return The identifier of the arc leaving <code>node</code> that is
   *         labeled with <code>label</code>, or 0 if the node has no such
   *         outgoing arc.
   */
  public abstract int getArc(int node, byte label);

  /**
   * @param arc
   *          The arc's identifier.
   * @return The label associated with the given <code>arc</code>.
   */
  public abstract byte getArcLabel(int arc);

  /**
   * @param arc
   *          The arc's identifier.
   * @return <code>true</code> if the destination node at the end of this
   *         <code>arc</code> corresponds to an input sequence that was added
   *         when the automaton was built.
   */
  public abstract boolean isArcFinal(int arc);

  /**
   * @param arc
   *          The arc's identifier.
   * @return <code>true</code> if this <code>arc</code> has no terminating node
   *         ({@link #getEndNode(int)} will throw an exception for it). A
   *         terminal arc is always final as well, see
   *         {@link #isArcFinal(int)}.
   */
  public abstract boolean isArcTerminal(int arc);

  /**
   * @param arc
   *          The arc's identifier.
   * @return The end node the given <code>arc</code> points to. Terminal arcs
   *         (those pointing to a terminal state) have no end node
   *         representation; asking for it throws a runtime exception.
   */
  public abstract int getEndNode(int arc);

  /**
   * @return The set of flags describing this FSA instance.
   */
  public abstract Set<FSAFlags> getFlags();

  /**
   * Counts the outgoing arcs of a node by walking them one by one with
   * {@link #getFirstArc(int)} and {@link #getNextArc(int)}.
   *
   * @param node
   *          Identifier of the node.
   * @return The number of arcs leaving <code>node</code>.
   */
  public int getArcCount(int node) {
    int total = 0;
    int arc = getFirstArc(node);
    while (arc != 0) {
      total++;
      arc = getNextArc(arc);
    }
    return total;
  }

  /**
   * Returns the size of the right language of a state, that is the number of
   * sequences reachable from it. This is only available for automata compiled
   * with {@link FSAFlags#NUMBERS}; this base implementation always throws and
   * such automata are expected to override it.
   *
   * @param node
   *          Identifier of the node.
   * @return The number of sequences reachable from the given state.
   * @throws UnsupportedOperationException
   *           If the automaton was not compiled with {@link FSAFlags#NUMBERS}.
   *           In that case the value can be obtained by counting the elements
   *           of {@link #getSequences(int)} manually.
   */
  public int getRightLanguageCount(int node) {
    final String message = "Automaton not compiled with " + FSAFlags.NUMBERS;
    throw new UnsupportedOperationException(message);
  }

  /**
   * Returns an iterable over all byte sequences that start at the given FSA
   * state (node) and end in final nodes. In other words: the set of suffixes
   * that complete a given prefix into sequences stored in the automaton.
   *
   * <p>
   * Iterators obtained from the result return a {@link ByteBuffer} whose
   * contents change on each call to {@link Iterator#next()}. To keep the
   * contents between calls to {@link Iterator#next()}, copy the buffer to some
   * other location.
   * </p>
   *
   * <p>
   * <b>Important.</b> It is guaranteed that the returned byte buffer is backed
   * by a byte array and that the content of the byte buffer starts at the
   * array's index 0.
   * </p>
   *
   * @param node
   *          Identifier of the starting node from which to return
   *          subsequences. For node 0 an empty iterable is returned.
   * @return An iterable over all sequences encoded starting at the given node.
   */
  public Iterable<ByteBuffer> getSequences(final int node) {
    if (node != 0) {
      return new Iterable<ByteBuffer>() {
        public Iterator<ByteBuffer> iterator() {
          return new ByteSequenceIterator(FSA.this, node);
        }
      };
    }

    // Node 0: nothing to enumerate.
    return Collections.<ByteBuffer> emptyList();
  }

  /**
   * Returns all sequences stored in the automaton. Equivalent to calling
   * {@link #getSequences(int)} with {@link #getRootNode()}; {@link #iterator()}
   * iterates over the same sequences.
   *
   * @return An iterable over all sequences encoded in the automaton.
   */
  public final Iterable<ByteBuffer> getSequences() {
    final int root = getRootNode();
    return getSequences(root);
  }

  /**
   * Returns an iterator over all byte sequences that start at the initial FSA
   * state (node) and end in final nodes. The iterator returns a
   * {@link ByteBuffer} whose contents change on each call to
   * {@link Iterator#next()}. To keep the contents between calls to
   * {@link Iterator#next()}, copy the buffer to some other location.
   *
   * <p>
   * <b>Important.</b> It is guaranteed that the returned byte buffer is backed
   * by a byte array and that the content of the byte buffer starts at the
   * array's index 0.
   * </p>
   */
  public final Iterator<ByteBuffer> iterator() {
    final Iterable<ByteBuffer> everything = getSequences();
    return everything.iterator();
  }

  /**
   * Visits all states in an unspecified order. This default implementation
   * delegates to {@link #visitInPostOrder(StateVisitor)}; the order is left
   * undefined so that an implementation may traverse faster than a post- or
   * preorder walk (for example by scanning states linearly). Returning false
   * from {@link StateVisitor#accept(int)} immediately terminates the
   * traversal.
   *
   * @param v Visitor to receive traversal calls.
   * @param <T> A subclass of {@link StateVisitor}.
   * @return Returns the argument (for access to anonymous class fields).
   */
  public <T extends StateVisitor> T visitAllStates(T v) {
    final T visitor = visitInPostOrder(v);
    return visitor;
  }

  /**
   * Postorder traversal of all states reachable from the root node; see
   * {@link #visitInPostOrder(StateVisitor, int)}.
   *
   * @param v Visitor to receive traversal calls.
   * @param <T> A subclass of {@link StateVisitor}.
   * @return Returns the argument (for access to anonymous class fields).
   */
  public <T extends StateVisitor> T visitInPostOrder(T v) {
    final int root = getRootNode();
    return visitInPostOrder(v, root);
  }

  /**
   * Visits, in postorder, every state reachable from <code>node</code>. Each
   * state is visited at most once. Returning false from
   * {@link StateVisitor#accept(int)} immediately terminates the traversal.
   *
   * @param v Visitor to receive traversal calls.
   * @param <T> A subclass of {@link StateVisitor}.
   * @param node Identifier of the node.
   * @return Returns the argument (for access to anonymous class fields).
   */
  public <T extends StateVisitor> T visitInPostOrder(T v, int node) {
    final BitSet seen = new BitSet();
    visitInPostOrder(v, node, seen);
    return v;
  }

  /**
   * Recursive worker for the postorder traversal.
   *
   * @param v Visitor to receive traversal calls.
   * @param node Node to descend from.
   * @param visited Nodes already entered; updated in place.
   * @return <code>false</code> if the visitor asked to stop the traversal,
   *         <code>true</code> otherwise (including when <code>node</code> was
   *         already visited).
   */
  private boolean visitInPostOrder(StateVisitor v, int node, BitSet visited) {
    if (visited.get(node)) {
      return true;
    }
    visited.set(node);

    int arc = getFirstArc(node);
    while (arc != 0) {
      // Terminal arcs have no end node, so there is nothing to descend into.
      if (!isArcTerminal(arc) && !visitInPostOrder(v, getEndNode(arc), visited)) {
        return false;
      }
      arc = getNextArc(arc);
    }

    // All children are done; report this node last.
    return v.accept(node);
  }

  /**
   * Preorder traversal of all states reachable from the root node; see
   * {@link #visitInPreOrder(StateVisitor, int)}.
   *
   * @param v Visitor to receive traversal calls.
   * @param <T> A subclass of {@link StateVisitor}.
   * @return Returns the argument (for access to anonymous class fields).
   */
  public <T extends StateVisitor> T visitInPreOrder(T v) {
    final int root = getRootNode();
    return visitInPreOrder(v, root);
  }

  /**
   * Visits, in preorder, the states reachable from <code>node</code>. Each
   * state is visited at most once. Returning false from
   * {@link StateVisitor#accept(int)} skips traversal of all sub-states of the
   * state just visited; the traversal of other states continues.
   *
   * @param v Visitor to receive traversal calls.
   * @param <T> A subclass of {@link StateVisitor}.
   * @param node Identifier of the node.
   * @return Returns the argument (for access to anonymous class fields).
   */
  public <T extends StateVisitor> T visitInPreOrder(T v, int node) {
    final BitSet seen = new BitSet();
    visitInPreOrder(v, node, seen);
    return v;
  }

  /**
   * Drains an input stream into memory.
   *
   * @param in The input stream.
   * @return All bytes remaining in the stream, up to its end, as a byte array.
   * @throws IOException Rethrown if an I/O exception occurs.
   */
  protected static final byte[] readRemaining(InputStream in) throws IOException {
    final ByteArrayOutputStream collected = new ByteArrayOutputStream();
    final byte[] chunk = new byte[8 * 1024];
    // A negative count signals the end of the stream.
    for (int count = in.read(chunk); count >= 0; count = in.read(chunk)) {
      collected.write(chunk, 0, count);
    }
    return collected.toByteArray();
  }

  /**
   * Recursive worker for the preorder traversal.
   *
   * @param v Visitor to receive traversal calls.
   * @param node Node to report and then descend from.
   * @param visited Nodes already entered; updated in place.
   */
  private void visitInPreOrder(StateVisitor v, int node, BitSet visited) {
    if (visited.get(node)) {
      return;
    }
    visited.set(node);

    // Report this node first; a rejection prunes everything below it.
    if (!v.accept(node)) {
      return;
    }

    int arc = getFirstArc(node);
    while (arc != 0) {
      // Terminal arcs have no end node, so there is nothing to descend into.
      if (!isArcTerminal(arc)) {
        visitInPreOrder(v, getEndNode(arc), visited);
      }
      arc = getNextArc(arc);
    }
  }

  /**
   * A factory for reading automata in any of the supported versions. The
   * header is read first and its version selects the implementation that
   * reads the rest of the stream.
   *
   * @param stream
   *          The input stream to read automaton data from. The stream is not
   *          closed.
   * @return Returns an instantiated automaton. Never null.
   * @throws IOException
   *           If the input stream does not represent an automaton or is
   *           otherwise invalid.
   */
  public static FSA read(InputStream stream) throws IOException {
    final FSAHeader header = FSAHeader.read(stream);

    if (header.version == FSA5.VERSION) {
      return new FSA5(stream);
    }
    if (header.version == CFSA.VERSION) {
      return new CFSA(stream);
    }
    if (header.version == CFSA2.VERSION) {
      return new CFSA2(stream);
    }

    final String message =
        String.format(Locale.ROOT, "Unsupported automaton version: 0x%02x", header.version & 0xFF);
    throw new IOException(message);
  }

  /**
   * A factory for reading a specific FSA subclass, including proper casting.
   *
   * @param stream
   *          The input stream to read automaton data from. The stream is not
   *          closed.
   * @param clazz A subclass of {@link FSA} to cast the read automaton to.
   * @param <T> A subclass of {@link FSA} to cast the read automaton to.
   * @return Returns an instantiated automaton. Never null.
   * @throws IOException
   *           If the input stream does not represent an automaton, is otherwise
   *           invalid or the class of the automaton read from the input stream
   *           is not assignable to <code>clazz</code>.
   */
  public static <T extends FSA> T read(InputStream stream, Class<? extends T> clazz) throws IOException {
    final FSA automaton = read(stream);
    if (clazz.isInstance(automaton)) {
      return clazz.cast(automaton);
    }

    final String message = String.format(Locale.ROOT,
        "Expected FSA type %s, but read an incompatible type %s.",
        clazz.getName(), automaton.getClass().getName());
    throw new IOException(message);
  }
}