java source code of MapReduceGeneratorInputFormat

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.mrql;

import java.io.*;
import java.util.Iterator;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;


/** The FileInputFormat for data generators: the generator data is kept in sequence files,
 *  where an (offset,size) pair stands for the size consecutive long values
 *  offset, offset+1, ..., offset+size-1 (that is, the half-open range [offset,offset+size)) */
final public class MapReduceGeneratorInputFormat extends MapReduceMRQLFileInputFormat {
    /** A record reader that reads the first (offset,size) pair of a split's file and then
     *  produces the pairs (index,offset+index) for index = 0, ..., size-1 */
    public static class GeneratorRecordReader extends RecordReader<MRContainer,MRContainer> {
        /** the first value to be generated */
        final long offset;
        /** the number of values to be generated */
        final long size;
        /** the index of the current value; -1 before the first call to nextKeyValue */
        long index;
        /** the reader of the file that holds the (offset,size) pair; released by close() */
        SequenceFile.Reader reader;

        /** Open the file of the split and read its first (offset,size) pair.
         *  A file without any record is treated as an empty range (no values are generated).
         * @param split the file split to read from
         * @param context the task context that provides the configuration
         * @throws IOException if the file cannot be opened or read
         */
        public GeneratorRecordReader ( FileSplit split,
                                       TaskAttemptContext context ) throws IOException {
            Configuration conf = context.getConfiguration();
            Path path = split.getPath();
            reader = new SequenceFile.Reader(path.getFileSystem(conf),path,conf);
            long first = 0;
            long count = 0;
            boolean initialized = false;
            try {
                MRContainer key = new MRContainer();
                MRContainer value = new MRContainer();
                // next returns false when the file has no records; value is not filled in then
                if (reader.next(key,value)) {
                    Tuple range = (Tuple)value.data();
                    first = ((MR_long)range.first()).get();
                    count = ((MR_long)range.second()).get();
                }
                initialized = true;
            } finally {
                // nobody can call close() on a half-constructed reader, so release it here
                if (!initialized)
                    reader.close();
            }
            offset = first;
            size = count;
            index = -1;
        }

        /** Advance to the next generated value.
         * @return true if there is a current key/value after the call
         */
        public boolean nextKeyValue () throws IOException {
            index++;
            return index < size;
        }

        /** @return the index of the current value, starting from 0 */
        public MRContainer getCurrentKey () throws IOException {
            return new MRContainer(new MR_long(index));
        }

        /** @return the current generated value, which is offset+index */
        public MRContainer getCurrentValue () throws IOException {
            return new MRContainer(new MR_long(offset+index));
        }

        /** Close the underlying sequence file reader */
        public void close () throws IOException { reader.close(); }

        /** @return the fraction of the values generated so far, always between 0.0 and 1.0 */
        public float getProgress () throws IOException {
            // an empty range would give 0/0 = NaN; index is -1 before the first record
            if (size <= 0 || index <= 0)
                return 0.0f;
            // index keeps growing if nextKeyValue is called after the end, so cap at 1
            return Math.min(1.0f,index/(float)size);
        }

        /** Does nothing: all the initialization is done by the constructor */
        public void initialize ( InputSplit split, TaskAttemptContext context ) throws IOException { }
    }

    /** Create the record reader that generates the values of a split.
     * @param split the input split, which must be a FileSplit
     * @param context the task context that provides the configuration
     * @return a new GeneratorRecordReader over the split
     */
    public RecordReader<MRContainer,MRContainer>
              createRecordReader ( InputSplit split, TaskAttemptContext context ) throws IOException {
        return new GeneratorRecordReader((FileSplit)split,context);
    }

    /** Insert all results from the generators stored in path into a Bag.
     *  The Bag is lazily constructed: the (offset,size) pairs are read one at a time while
     *  the Bag is traversed, and the file is closed when the last pair has been consumed.
     *  Pairs with a size less than or equal to zero generate no values.
     * @param path the path of the generator data (offset,size); it is opened as a sequence file
     * @return a Bag that contains all data
     * @throws IOException if the sequence file cannot be opened
     */
    public Bag materialize ( final Path path ) throws IOException {
        Configuration conf = Plan.conf;
        final SequenceFile.Reader reader = new SequenceFile.Reader(path.getFileSystem(conf),path,conf);
        final MRContainer key = new MRContainer();
        final MRContainer value = new MRContainer();
        return new Bag(new BagIterator () {
                // the value returned by next()
                long current = 0;
                // the exclusive upper bound (offset+size) of the range being generated
                long end = 0;
                // true after the last pair has been consumed and the reader has been closed
                boolean exhausted = false;
                /** advances to the next value; must be called once before each call to next() */
                public boolean hasNext () {
                    if (exhausted)
                        return false;
                    current++;
                    try {
                        // a loop, so that pairs that describe an empty range are skipped
                        while (current >= end) {
                            if (!reader.next(key,value)) {
                                exhausted = true;
                                reader.close();
                                return false;
                            }
                            Tuple range = (Tuple)value.data();
                            current = ((MR_long)range.first()).get();
                            end = current+((MR_long)range.second()).get();
                        }
                    } catch (IOException e) {
                        exhausted = true;
                        try {
                            reader.close();
                        } catch (IOException ignored) {
                            // the original failure is the one reported below
                        }
                        throw new Error("Cannot collect values from a generator",e);
                    }
                    return true;
                }
                public MRData next () {
                    return new MR_long(current);
                }
            });
    }
}