/** * Copyright 2015 Conductor, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. * */ package com.conductor.s3; import java.io.IOException; import java.util.List; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.s3native.S3NativeFileSystemConfigKeys; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import com.amazonaws.services.s3.AmazonS3; /** * A {@link FileInputFormat} (MRV2 API) that is optimized for S3-based input, and supports recursive discovery of input * files given a single parent directory. * <p> * Job start-up time is much faster because this input format uses the {@link com.amazonaws.services.s3.AmazonS3} client * to discover job input files rather than the {@link org.apache.hadoop.fs.s3.S3FileSystem}. * <p> * This {@link FileInputFormat} supports adding just the top-level "directory" (i.e. a single S3 prefix) as file input; * it will recursively discover all files under the given prefix. This is <em>much</em> faster than adding individual * files to the job. * * @author cgreen * @see S3SequenceFileInputFormat * @see S3TextInputFormat * @author cgreen */ public abstract class S3OptimizedFileInputFormat<K, V> extends FileInputFormat<K, V> { @Override protected List<FileStatus> listStatus(final JobContext job) throws IOException { final Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } final long blockSize = job.getConfiguration().getLong(S3NativeFileSystemConfigKeys.S3_NATIVE_BLOCK_SIZE_KEY, S3NativeFileSystemConfigKeys.S3_NATIVE_BLOCK_SIZE_DEFAULT); final AmazonS3 s3Client = S3HadoopUtils.getS3Client(job.getConfiguration()); return S3InputFormatUtils.getFileStatuses(s3Client, blockSize, dirs); } /** * {@inheritDoc} */ @Override public List<InputSplit> getSplits(final JobContext job) throws IOException { long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); final List<FileStatus> files = listStatus(job); return S3InputFormatUtils.convertToInputSplits(files, minSize, maxSize); } }