/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package org.anarres.parallelgzip;

import java.io.ByteArrayOutputStream;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.zip.CRC32;
import java.util.zip.Deflater;
import java.util.zip.DeflaterOutputStream;
import java.util.zip.GZIPOutputStream;
import javax.annotation.CheckForNull;
import javax.annotation.Nonnegative;
import javax.annotation.Nonnull;

/**
 * A multi-threaded version of {@link GZIPOutputStream}.
 *
 * @author shevek
 */
public class ParallelGZIPOutputStream extends FilterOutputStream {

    private static final int GZIP_MAGIC = 0x8b1f;
    private static final int SIZE = 64 * 1024;

    @Nonnull
    private static Deflater newDeflater() {
        return new Deflater(Deflater.DEFAULT_COMPRESSION, true);
    }

    @Nonnull
    private static DeflaterOutputStream newDeflaterOutputStream(@Nonnull OutputStream out, @Nonnull Deflater deflater) {
        return new DeflaterOutputStream(out, deflater, 512, true);
    }

    /* Allow write into byte[] directly */
    private static class ByteArrayOutputStreamExposed extends ByteArrayOutputStream {

        public ByteArrayOutputStreamExposed(int size) {
            super(size);
        }

        public void writeTo(@Nonnull byte[] buf) throws IOException {
            System.arraycopy(this.buf, 0, buf, 0, count);
        }
    }

    private static class State {

        private final Deflater def = newDeflater();
        private final ByteArrayOutputStreamExposed buf = new ByteArrayOutputStreamExposed(SIZE + (SIZE >> 3));
        private final DeflaterOutputStream str = newDeflaterOutputStream(buf, def);
    }

    /** This ThreadLocal avoids the recycling of a lot of memory, causing lumpy performance. */
    private static final ThreadLocal<State> STATE = new ThreadLocal<State>() {
        @Override
        protected State initialValue() {
            return new State();
        }
    };

    private static class Block implements Callable<Block> {

        // private final int index;
        private byte[] buf = new byte[SIZE + (SIZE >> 3)];
        private int buf_length = 0;

        /*
         public Block(@Nonnegative int index) {
         this.index = index;
         }
         */
        // Only on worker thread
        @Override
        public Block call() throws IOException {
            // LOG.info("Processing " + this + " on " + Thread.currentThread());

            State state = STATE.get();
            // ByteArrayOutputStream buf = new ByteArrayOutputStream(in.length);   // Overestimate output size required.
            // DeflaterOutputStream def = newDeflaterOutputStream(buf);
            state.def.reset();
            state.buf.reset();
            state.str.write(buf, 0, buf_length);
            state.str.flush();

            // int in_length = buf_length;
            int out_length = state.buf.size();
            if (out_length > buf.length)
                this.buf = new byte[out_length];
            // System.out.println("Compressed " + in_length + " to " + out_length + " bytes.");
            this.buf_length = out_length;
            state.buf.writeTo(buf);

            // return Arrays.copyOf(in, in_length);
            return this;
        }

        @Override
        public String toString() {
            return "Block" /* + index */ + "(" + buf_length + "/" + buf.length + " bytes)";
        }
    }

    @Nonnegative
    private static int getThreadCount(@Nonnull ExecutorService executor) {
        if (executor instanceof ThreadPoolExecutor)
            return ((ThreadPoolExecutor) executor).getMaximumPoolSize();
        return Runtime.getRuntime().availableProcessors();
    }

    // TODO: Share, daemonize.
    private final ExecutorService executor;
    private final CRC32 crc = new CRC32();
    private final int emitQueueSize;
    private final BlockingQueue<Future<Block>> emitQueue;
    @Nonnull
    private Block block = new Block();
    @CheckForNull
    private Block freeBlock = null;
    /** Used as a sentinel for 'closed'. */
    private long bytesWritten = 0;

    // Master thread only
    @Deprecated // Doesn't really use the given number of threads.
    public ParallelGZIPOutputStream(@Nonnull OutputStream out, @Nonnull ExecutorService executor, @Nonnegative int nthreads) throws IOException {
        super(out);
        this.executor = executor;
        // Some blocks compress faster than others; allow a long enough queue to keep all CPUs busy at least for a bit.
        this.emitQueueSize = nthreads * 3;
        this.emitQueue = new ArrayBlockingQueue<Future<Block>>(emitQueueSize);
        writeHeader();
    }

    /**
     * Creates a ParallelGZIPOutputStream
     * using {@link ParallelGZIPEnvironment#getSharedThreadPool()}.
     *
     * @param out the eventual output stream for the compressed data.
     * @throws IOException if it all goes wrong.
     */
    @Deprecated // Doesn't really use the given number of threads.
    public ParallelGZIPOutputStream(@Nonnull OutputStream out, @Nonnegative int nthreads) throws IOException {
        this(out, ParallelGZIPEnvironment.getSharedThreadPool(), nthreads);
    }

    public ParallelGZIPOutputStream(@Nonnull OutputStream out, @Nonnull ExecutorService executor) throws IOException {
        this(out, executor, getThreadCount(executor));
    }

    /**
     * Creates a ParallelGZIPOutputStream
     * using {@link ParallelGZIPEnvironment#getSharedThreadPool()}.
     *
     * @param out the eventual output stream for the compressed data.
     * @throws IOException if it all goes wrong.
     */
    public ParallelGZIPOutputStream(@Nonnull OutputStream out) throws IOException {
        this(out, ParallelGZIPEnvironment.getSharedThreadPool());
    }

    /*
     * @see http://www.gzip.org/zlib/rfc-gzip.html#file-format
     */
    private void writeHeader() throws IOException {
        out.write(new byte[]{
            (byte) GZIP_MAGIC, // ID1: Magic number (little-endian short)
            (byte) (GZIP_MAGIC >> 8), // ID2: Magic number (little-endian short)
            Deflater.DEFLATED, // CM: Compression method
            0, // FLG: Flags (byte)
            0, 0, 0, 0, // MTIME: Modification time (int)
            0, // XFL: Extra flags
            3 // OS: Operating system (3 = Linux)
        });
    }

    // Master thread only
    @Override
    public void write(int b) throws IOException {
        byte[] single = new byte[1];
        single[0] = (byte) (b & 0xFF);
        write(single);
    }

    // Master thread only
    @Override
    public void write(byte[] b) throws IOException {
        write(b, 0, b.length);
    }

    // Master thread only
    @Override
    public void write(byte[] b, int off, int len) throws IOException {
        crc.update(b, off, len);
        bytesWritten += len;

        while (len > 0) {
            final byte[] blockBuf = block.buf;
            // assert block.in_length < block.in.length
            int capacity = SIZE - block.buf_length; // Make sure we don't grow the block buf repeatedly.
            if (len >= capacity) {
                System.arraycopy(b, off, blockBuf, block.buf_length, capacity);
                block.buf_length += capacity;   // == block.in.length
                off += capacity;
                len -= capacity;
                submit();
            } else {
                System.arraycopy(b, off, blockBuf, block.buf_length, len);
                block.buf_length += len;
                // off += len;
                // len = 0;
                break;
            }
        }
    }

    // Master thread only
    private void submit() throws IOException {
        emitUntil(emitQueueSize - 1);
        emitQueue.add(executor.submit(block));
        Block b = freeBlock;
        if (b != null)
            freeBlock = null;
        else
            b = new Block();
        block = b;
    }

    // Emit If Available - submit always
    // Emit At Least one - submit when executor is full
    // Emit All Remaining - flush(), close()
    // Master thread only
    private void tryEmit() throws IOException, InterruptedException, ExecutionException {
        for (;;) {
            Future<Block> future = emitQueue.peek();
            // LOG.info("Peeked future " + future);
            if (future == null)
                return;
            if (!future.isDone())
                return;
            // It's an ordered queue. This MUST be the same element as above.
            Block b = emitQueue.remove().get();
            // System.out.println("Chance-emitting block " + b);
            out.write(b.buf, 0, b.buf_length);
            b.buf_length = 0;
            freeBlock = b;
        }
    }

    // Master thread only
    /** Emits any opportunistically available blocks. Furthermore, emits blocks until the number of executing tasks is less than taskCountAllowed. */
    private void emitUntil(@Nonnegative int taskCountAllowed) throws IOException {
        try {
            while (emitQueue.size() > taskCountAllowed) {
                // LOG.info("Waiting for taskCount=" + emitQueue.size() + " -> " + taskCountAllowed);
                Block b = emitQueue.remove().get();  // Valid because emitQueue.size() > 0
                // System.out.println("Force-emitting block " + b);
                out.write(b.buf, 0, b.buf_length);  // Blocks until this task is done.
                b.buf_length = 0;
                freeBlock = b;
            }
            // We may have achieved more opportunistically available blocks
            // while waiting for a block above. Let's emit them here.
            tryEmit();
        } catch (ExecutionException e) {
            throw new IOException(e);
        } catch (InterruptedException e) {
            throw new InterruptedIOException();
        }
    }

    // Master thread only
    @Override
    public void flush() throws IOException {
        // LOG.info("Flush: " + block);
        if (block.buf_length > 0)
            submit();
        emitUntil(0);
        super.flush();
    }

    // Master thread only
    @Override
    public void close() throws IOException {
        // LOG.info("Closing: bytesWritten=" + bytesWritten);
        if (bytesWritten >= 0) {
            flush();

            newDeflaterOutputStream(out, newDeflater()).finish();

            ByteBuffer buf = ByteBuffer.allocate(8);
            buf.order(ByteOrder.LITTLE_ENDIAN);
            // LOG.info("CRC is " + crc.getValue());
            buf.putInt((int) crc.getValue());
            buf.putInt((int) (bytesWritten % 4294967296L));
            out.write(buf.array()); // allocate() guarantees a backing array.
            // LOG.info("trailer is " + Arrays.toString(buf.array()));

            out.flush();
            out.close();

            bytesWritten = Integer.MIN_VALUE;
            // } else {
            // LOG.warn("Already closed.");

            freeBlock = null;
        }
    }
}