org.apache.hadoop.mapred.MultiFileSplit Java Examples

The following examples show how to use org.apache.hadoop.mapred.MultiFileSplit, drawn from several open-source projects. Each example names its original project and source file.
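MultiFileSplit packs several files, typically small ones, into a single InputSplit so that one map task can process them together. As a quick orientation, here is a minimal sketch that builds a split directly; the paths and lengths are hypothetical, and in a real job the split would normally be produced by a MultiFileInputFormat rather than by hand.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MultiFileSplit;

public class MultiFileSplitSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();

    // Hypothetical files and sizes, purely for illustration.
    Path[] files = { new Path("/data/part-00000"), new Path("/data/part-00001") };
    long[] lengths = { 1024L, 2048L };

    // One split spanning both files; getPaths() and getLength() are the
    // accessors the record readers in the examples below rely on.
    MultiFileSplit split = new MultiFileSplit(job, files, lengths);
    System.out.println(split.getNumPaths() + " paths, " + split.getLength() + " total bytes");
  }
}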
Example #1
Source File: WarcFileRecordReader.java    From wikireverse with MIT License
public WarcFileRecordReader(Configuration conf, InputSplit split) throws IOException {
  if (split instanceof FileSplit) {
    this.filePathList = new Path[1];
    this.filePathList[0] = ((FileSplit) split).getPath();
  } else if (split instanceof MultiFileSplit) {
    this.filePathList = ((MultiFileSplit) split).getPaths();
  } else {
    throw new IOException("InputSplit is not a file split or a multi-file split - aborting");
  }

  // Use FileSystem.get to open Common Crawl URIs using the S3 protocol.
  URI uri = filePathList[0].toUri();
  this.fs = FileSystem.get(uri, conf);

  // Sum the sizes of all files in the split.
  for (int i = 0; i < filePathList.length; i++) {
    totalFileSize += fs.getFileStatus(filePathList[i]).getLen();
  }

  Class<? extends CompressionCodec> codecClass = null;

  try {
    codecClass = conf.getClassByName("org.apache.hadoop.io.compress.GzipCodec").asSubclass(CompressionCodec.class);
    compressionCodec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
  } catch (ClassNotFoundException cnfEx) {
    compressionCodec = null;
    LOG.info("ClassNotFoundException thrown while setting the Gzip codec");
  }

  openNextFile();
}
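The constructor ends by calling openNextFile(), which is not part of this snippet. Purely as a hedged sketch of what such a method might do, assuming a hypothetical currentFileIndex field and a DataInputStream field named currentInput; the actual wikireverse implementation may differ:

// Hypothetical sketch: currentFileIndex and currentInput are assumed fields,
// not taken from the wikireverse source.
private int currentFileIndex = 0;

private void openNextFile() throws IOException {
  if (currentFileIndex >= filePathList.length) {
    return; // nothing left to open; the reader is exhausted
  }
  Path path = filePathList[currentFileIndex++];
  InputStream stream = fs.open(path);
  // Wrap gzipped WARC files with the codec configured in the constructor.
  if (compressionCodec != null && path.getName().endsWith(".gz")) {
    stream = compressionCodec.createInputStream(stream);
  }
  currentInput = new DataInputStream(stream);
}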
 
Example #2
Source File: MultiFileWordCount.java    From RDFS with Apache License 2.0
public MultiFileLineRecordReader(Configuration conf, MultiFileSplit split)
  throws IOException {
  
  this.split = split;
  fs = FileSystem.get(conf);
  this.paths = split.getPaths();
  this.totLength = split.getLength();
  this.offset = 0;
  
  // Open the first file; next() advances through the remaining paths.
  Path file = paths[count];
  currentStream = fs.open(file);
  currentReader = new BufferedReader(new InputStreamReader(currentStream));
}
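This constructor only opens the first file; a matching next() method has to advance through the remaining paths as each file is exhausted. The following is a minimal sketch of such a method, assuming WordOffset exposes fileName and offset fields to the reader (for instance because both are nested classes of the same outer class); the original MultiFileWordCount may differ in detail.

// Sketch only: count, offset, paths, fs, split, currentStream and
// currentReader are the fields initialized by the constructor above;
// WordOffset.fileName and WordOffset.offset are assumed accessible.
public boolean next(WordOffset key, Text value) throws IOException {
  if (count >= split.getNumPaths()) {
    return false; // every file in the split has been consumed
  }
  String line = currentReader.readLine();
  while (line == null) {
    // The current file is exhausted: close it and open the next one.
    currentReader.close();
    if (++count >= split.getNumPaths()) {
      return false;
    }
    currentStream = fs.open(paths[count]);
    currentReader = new BufferedReader(new InputStreamReader(currentStream));
    offset = 0; // restart the offset within the new file
    line = currentReader.readLine();
  }
  key.fileName = paths[count].getName();
  key.offset = offset;
  value.set(line);
  offset += line.length() + 1; // char count plus newline, an approximation of bytes
  return true;
}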
 
Example #3
Source File: MultiFileWordCount.java    From hadoop-book with Apache License 2.0
public MultiFileLineRecordReader(Configuration conf, MultiFileSplit split)
        throws IOException {

    this.split = split;
    fs = FileSystem.get(conf);
    this.paths = split.getPaths();
    this.totLength = split.getLength();
    this.offset = 0;

    // Open the first file; next() advances through the remaining paths.
    Path file = paths[count];
    currentStream = fs.open(file);
    currentReader = new BufferedReader(new InputStreamReader(currentStream));
}
 
Example #4
Source File: MultiFileWordCount.java    From hadoop-gpu with Apache License 2.0
public MultiFileLineRecordReader(Configuration conf, MultiFileSplit split)
  throws IOException {
  
  this.split = split;
  fs = FileSystem.get(conf);
  this.paths = split.getPaths();
  this.totLength = split.getLength();
  this.offset = 0;
  
  // Open the first file; next() advances through the remaining paths.
  Path file = paths[count];
  currentStream = fs.open(file);
  currentReader = new BufferedReader(new InputStreamReader(currentStream));
}
 
Example #5
Source File: MultiFileWordCount.java    From RDFS with Apache License 2.0
@Override
public RecordReader<WordOffset, Text> getRecordReader(InputSplit split,
    JobConf job, Reporter reporter) throws IOException {
  return new MultiFileLineRecordReader(job, (MultiFileSplit)split);
}
 
Example #6
Source File: MultiFileWordCount.java    From hadoop-book with Apache License 2.0
@Override
public RecordReader<WordOffset, Text> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    return new MultiFileLineRecordReader(job, (MultiFileSplit) split);
}
 
Example #7
Source File: MultiFileWordCount.java    From hadoop-gpu with Apache License 2.0
@Override
public RecordReader<WordOffset, Text> getRecordReader(InputSplit split,
    JobConf job, Reporter reporter) throws IOException {
  return new MultiFileLineRecordReader(job, (MultiFileSplit)split);
}