package hip.ch3.avro;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.mapred.AvroAsTextInputFormat;
import org.apache.avro.mapred.AvroTextOutputFormat;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.Iterator;

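/**
 * End-to-end example of MapReduce over Avro container files using the
 * text-oriented adapters: {@link AvroAsTextInputFormat} presents each Avro
 * datum to the mapper as a Text key with an empty Text value, and
 * {@link AvroTextOutputFormat} writes key/value pairs back out to an Avro
 * file with a "bytes" schema.
 * <p>
 * Usage: AvroTextMapReduce &lt;input-file&gt; &lt;output-dir&gt;
 */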
public class AvroTextMapReduce {

  public static final String[] LINES = new String[]{
      "the quick brown fox jumps over the lazy dog",
      "the cow jumps over the moon",
      "the rain in spain falls mainly on the plains"
  };

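  /**
   * Writes each sample line as a datum in an Avro container file with a
   * plain "bytes" schema; AvroAsTextInputFormat later hands each datum to
   * the mapper as a line of text.
   */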
  public static void writeLinesBytesFile(OutputStream os)
      throws IOException {
    DatumWriter<ByteBuffer> writer =
        new GenericDatumWriter<ByteBuffer>();
    DataFileWriter<ByteBuffer> out =
        new DataFileWriter<ByteBuffer>(writer);
    out.create(Schema.create(Schema.Type.BYTES), os);
    for (String line : LINES) {
      out.append(ByteBuffer.wrap(line.getBytes("UTF-8")));
    }
    out.close();
  }

  /**
   * Writes the sample lines to an Avro file in HDFS, runs an identity
   * map and reduce job over them, and prints the sorted output.
   */
  public static void main(String... args) throws Exception {
    JobConf job = new JobConf();
    job.setJarByClass(AvroTextMapReduce.class);
    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

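    // clear any output left over from a previous run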
    output.getFileSystem(job).delete(output, true);

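    // generate the sample Avro input file directly at the job's input path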
    FileSystem fs = input.getFileSystem(job);
    OutputStream os = fs.create(input);
    writeLinesBytesFile(os);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

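    // consume Avro datums as lines of text and write the shuffled
    // key/value pairs back out as an Avro "bytes" file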
    job.setInputFormat(AvroAsTextInputFormat.class);
    job.setOutputFormat(AvroTextOutputFormat.class);
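    // the output value class defaults to Text, so only the key needs setting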
    job.setOutputKeyClass(Text.class);

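    // identity mapper and reducer; the framework's sort phase orders the lines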
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    JobClient.runJob(job);

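    // read back the single reducer's output and print it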
    validateSortedFile(output.getFileSystem(job)
        .open(new Path(output, "part-00000.avro")));
  }

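  /** Identity mapper: passes each record through unchanged. */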
  public static class Mapper
      extends MapReduceBase implements
      org.apache.hadoop.mapred.Mapper<Text, Text, Text, Text> {
    @Override
    public void map(Text key, Text value,
                    OutputCollector<Text, Text> output,
                    Reporter reporter) throws IOException {
      output.collect(key, value);
    }
  }

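  /** Identity reducer: emits every value for each key unchanged. */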
  public static class Reducer
      extends MapReduceBase implements
      org.apache.hadoop.mapred.Reducer<Text, Text, Text, Text> {
    @Override
    public void reduce(Text key, Iterator<Text> values,
                       OutputCollector<Text, Text> output,
                       Reporter reporter)
        throws IOException {
      while (values.hasNext()) {
        output.collect(key, values.next());
      }
    }
  }

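  /**
   * Reads the job output as an Avro "bytes" file and prints each record so
   * the sorted order can be verified by eye.
   */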
  public static void validateSortedFile(InputStream is)
      throws Exception {
    DatumReader<ByteBuffer> reader =
        new GenericDatumReader<ByteBuffer>();
    DataFileStream<ByteBuffer> lines =
        new DataFileStream<ByteBuffer>(is, reader);

    for (ByteBuffer line : lines) {
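      // copy the datum's bytes out of the ByteBuffer so they can be decoded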
      byte[] b = new byte[line.remaining()];
      line.get(b);
      System.out.println(new String(b, "UTF-8").trim());
    }

    is.close();
  }
}