package com.martinkl.warc.mapred;

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
import com.martinkl.warc.WARCFileWriter;
import com.martinkl.warc.WARCWritable;

/**
 * OutputFormat for Hadoop jobs built on the 'old' `mapred` API whose output should be
 * stored as WARC files.
 *
 * Usage:
 *
 * ```java
 * JobConf job = new JobConf(getConf());
 * job.setOutputFormat(WARCOutputFormat.class);
 * job.setOutputKeyClass(NullWritable.class);
 * job.setOutputValueClass(WARCWritable.class);
 * FileOutputFormat.setCompressOutput(job, true);
 * ```
 *
 * Whichever tasks produce the job output (normally the reducers, or the mappers when the
 * job has no reducers) should emit `NullWritable.get()` as the key and a
 * {@link WARCWritable} as the value.
 */
public class WARCOutputFormat extends FileOutputFormat<NullWritable, WARCWritable> {

    /**
     * Opens a new WARC output file for this task and hands back a RecordWriter bound to it.
     *
     * The `fs` argument is part of the overridden signature but is not used here; the
     * output location is resolved from `job` and `filename` instead.
     */
    @Override
    public RecordWriter<NullWritable, WARCWritable> getRecordWriter(FileSystem fs, JobConf job, String filename,
                                                                    Progressable progress) throws IOException {
        RecordWriter<NullWritable, WARCWritable> recordWriter = new WARCWriter(job, filename, progress);
        return recordWriter;
    }

    /**
     * RecordWriter that forwards every value to an underlying {@link WARCFileWriter};
     * the key of each record is ignored.
     */
    private static class WARCWriter implements RecordWriter<NullWritable, WARCWritable> {
        private final WARCFileWriter warcFile;

        /**
         * Sets up the underlying file writer for the task's output path. A codec obtained
         * from `WARCFileWriter.getGzipCodec` is passed along when the job has output
         * compression switched on; otherwise the codec is `null`.
         */
        public WARCWriter(JobConf job, String filename, Progressable progress) throws IOException {
            final CompressionCodec compression;
            if (getCompressOutput(job)) {
                compression = WARCFileWriter.getGzipCodec(job);
            } else {
                compression = null;
            }

            final Path taskOutput = FileOutputFormat.getTaskOutputPath(job, filename);
            this.warcFile = new WARCFileWriter(job, compression, taskOutput, progress);
        }

        /** Writes the given value to the WARC file; the key is not used. */
        @Override
        public void write(NullWritable key, WARCWritable value) throws IOException {
            warcFile.write(value);
        }

        /** Closes the underlying WARC file writer; the reporter is not used. */
        @Override
        public void close(Reporter reporter) throws IOException {
            warcFile.close();
        }
    }
}