java source code of WarcFileRecordReader

/**
 * A Hadoop record reader for reading Warc Records
 *
 * (C) 2009 - Carnegie Mellon University
 *
 * 1. Redistributions of this source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The names "Lemur", "Indri", "University of Massachusetts",
 *    "Carnegie Mellon", and "lemurproject" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. To obtain permission, contact
 *    [email protected].
 *
 * 4. Products derived from this software may not be called "Lemur" or "Indri"
 *    nor may "Lemur" or "Indri" appear in their names without prior written
 *    permission of The Lemur Project. To obtain permission,
 *    contact [email protected].
 *
 * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
 * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
 * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * @author [email protected] (Mark J. Hoy)
 */

package edu.cmu.lemurproject;

import edu.cmu.lemurproject.WarcRecord;

import java.io.DataInputStream;
import java.io.IOException;
import java.net.URI;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.MultiFileSplit;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.util.ReflectionUtils;

/*
 * With minor change to allow processing of Common Crawl files stored on S3.
 */
public class WarcFileRecordReader<K extends WritableComparable, V extends Writable>  implements RecordReader<LongWritable, WritableWarcRecord> {
  public static final Log LOG = LogFactory.getLog(WarcFileRecordReader.class);

  private long recordNumber=1;

  private Path[] filePathList=null;
  private int currentFilePath=-1;

  private FSDataInputStream currentFile=null;
  private CompressionCodec compressionCodec=null;
  private DataInputStream compressionInput=null;
  private FileSystem fs=null;
  private long totalFileSize=0;
  private long totalNumBytesRead=0;

public WarcFileRecordReader(Configuration conf, InputSplit split) throws IOException {
    if (split instanceof FileSplit) {
      this.filePathList=new Path[1];
      this.filePathList[0]=((FileSplit)split).getPath();
    } else if (split instanceof MultiFileSplit) {
      this.filePathList=((MultiFileSplit)split).getPaths();
    } else {
      throw new IOException("InputSplit is not a file split or a multi-file split - aborting");
    }

    // Use FileSystem.get to open Common Crawl URIs using the S3 protocol.
    URI uri = filePathList[0].toUri();
    this.fs = FileSystem.get(uri, conf);
    
    // get the total file sizes
    for (int i=0; i < filePathList.length; i++) {
      totalFileSize += fs.getFileStatus(filePathList[i]).getLen();
    }

    Class<? extends CompressionCodec> codecClass=null;

    try {
      codecClass=conf.getClassByName("org.apache.hadoop.io.compress.GzipCodec").asSubclass(CompressionCodec.class);
      compressionCodec=(CompressionCodec)ReflectionUtils.newInstance(codecClass, conf);
    } catch (ClassNotFoundException cnfEx) {
      compressionCodec=null;
      LOG.info("!!! ClassNotFoun Exception thrown setting Gzip codec");
    }

    openNextFile();
  }

  private boolean openNextFile() {
    try {
      if (compressionInput!=null) {
        compressionInput.close();
      } else if (currentFile!=null) {
        currentFile.close();
      }
      currentFile=null;
      compressionInput=null;

      currentFilePath++;
      if (currentFilePath >= filePathList.length) { return false; }

      currentFile=this.fs.open(filePathList[currentFilePath]);

      // is the file gzipped?
      if ((compressionCodec!=null) && (filePathList[currentFilePath].getName().endsWith("gz"))) {
        compressionInput=new DataInputStream(compressionCodec.createInputStream(currentFile));
        LOG.info("Compression enabled");
      }

    } catch (IOException ex) {
      LOG.info("IOError opening " + filePathList[currentFilePath].toString() + " - message: " + ex.getMessage());
      return false;
    }
    return true;
  }

  public boolean next(LongWritable key, WritableWarcRecord value) throws IOException {
    DataInputStream whichStream=null;
    if (compressionInput!=null) {
      whichStream=compressionInput;
    } else if (currentFile!=null) {
      whichStream=currentFile;
    }

    if (whichStream==null) { return false; }

    WarcRecord newRecord=WarcRecord.readNextWarcRecord(whichStream);
    if (newRecord==null) {
      // try advancing the file
      if (openNextFile()) {
        newRecord=WarcRecord.readNextWarcRecord(whichStream);
      }

      if (newRecord==null) { return false; }
    }

    totalNumBytesRead += (long)newRecord.getTotalRecordLength();
    newRecord.setWarcFilePath(filePathList[currentFilePath].toString());

    // now, set our output variables
    value.setRecord(newRecord);
    key.set(recordNumber);

    recordNumber++;
    return true;
  }

  public LongWritable createKey() {
    return new LongWritable();
  }

  public WritableWarcRecord createValue() {
    return new WritableWarcRecord();
  }

  public long getPos() throws IOException {
    return totalNumBytesRead;
  }

  public void close() throws IOException {
    totalNumBytesRead=totalFileSize;
    if (compressionInput!=null) {
      compressionInput.close();
    } else if (currentFile!=null) {
      currentFile.close();
    }
  }

  public float getProgress() throws IOException {
    if (compressionInput!=null) {
      if (filePathList.length==0) { return 1.0f; }
      // return which file - can't do extact byte matching
      return (float)currentFilePath / (float)(filePathList.length);
    }
    if (totalFileSize==0) { return 0.0f; }
    return (float)totalNumBytesRead/(float)totalFileSize;
  }

}