java source code of AvroUtils

/**
 * Copyright 2014 LinkedIn Corp. All rights reserved.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

package com.linkedin.mapred;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.AvroInputFormat;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.log4j.Logger;


public class AvroUtils
{
  private static final Logger _log = Logger.getLogger(AvroUtils.class);
  /**
   * Adds all subdirectories under a root path to the input format.
   * 
   * @param conf The JobConf.
   * @param path The root path.
   * @throws IOException
   */
  public static void addAllSubPaths(JobConf conf, Path path) throws IOException
  {
    if (shouldPathBeIgnored(path)) 
    {
      throw new IllegalArgumentException(String.format("Path[%s] should be ignored.", path));
    }

    final FileSystem fs = path.getFileSystem(conf);

    if(fs.exists(path)) 
    {
      for (FileStatus status : fs.listStatus(path)) 
      {
        if (! shouldPathBeIgnored(status.getPath())) 
        {
          if (status.isDir()) 
          {
            addAllSubPaths(conf, status.getPath());
          }
          else 
          {
            AvroInputFormat.addInputPath(conf, status.getPath());
          }
        }
      }
    }
  }
  /**
   * Enumerates all the files under a given path.
   * 
   * @param conf The JobConf.
   * @param basePath The base path.
   * @return A list of files found under the base path.
   * @throws IOException
   */
  public static List<Path> enumerateFiles(JobConf conf, Path basePath) throws IOException
  {
    if (shouldPathBeIgnored(basePath)) 
    {
      throw new IllegalArgumentException(String.format("Path[%s] should be ignored.", basePath));
    }
    List<Path> paths = new ArrayList<Path>();
    FileSystem fs = basePath.getFileSystem(conf);
    
    if (!fs.exists(basePath))
    {
      return Collections.emptyList();
    }
    
    for (FileStatus s: fs.listStatus(basePath))
    {
      if (! shouldPathBeIgnored(s.getPath())) 
      {
        if (s.isDir())
        {
          paths.addAll(enumerateFiles(conf, s.getPath()));
        }
        else
        {
          paths.add(s.getPath());
        }
      }
    }
    return paths;
  }
  /**
   * Check if the path should be ignored. Currently only paths with "_log" are ignored.
   * 
   * @param path
   * @return
   * @throws IOException
   */
  public static boolean shouldPathBeIgnored(Path path) throws IOException
  {
    return path.getName().startsWith("_");
  }
  
  /**
   * Loads the schema from an Avro data file.
   * 
   * @param conf The JobConf.
   * @param path The path to the data file.
   * @return The schema read from the data file's metadata.
   * @throws IOException
   */
  public static Schema getSchemaFromFile(JobConf conf, Path path) throws IOException
  {
    FileSystem fs = path.getFileSystem(new Configuration());
    FSDataInputStream dataInputStream = fs.open(path);
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);
    return dataFileStream.getSchema();
  }
  
  /**
   * Given a path to an output folder, it finds the existing "*.avro" files and adds 
   * them as cache files to be distributed. Throws an exception if no files are found/added.
   * 
   * @param conf Job configuration
   * @param outPath The path to the hdfs directory that has part files to cache
   * @throws Exception If no file is found at outPath throws a RuntimeException 
   */
  public static void addAvroCacheFiles(JobConf conf, Path outPath) throws Exception
  {
     FileStatus[] partFiles = getAvroPartFiles(conf, outPath);
     if (partFiles.length == 0)
     {      
       throw new RuntimeException("DistributedCacheFileUtils: No (part) file is found to cache at location:" + outPath );
     }
     
     for (FileStatus partFile : partFiles)
     {
       // add the file and set fileRead to true, since we have read at least one file
       DistributedCache.addCacheFile(partFile.getPath().toUri(), conf);
     }
   }
  
  public static FileStatus[] getAvroPartFiles(JobConf conf, Path outPath) throws IOException
  {
    Path outputPath = outPath;
    FileSystem fileSystem = outputPath.getFileSystem(conf);

    FileStatus[] partFiles = fileSystem.listStatus(outputPath, new PathFilter()
    {
      @Override
      public boolean accept(Path path)
      {
        if (path.getName().endsWith(".avro"))
        {
          return true;
        }
        return false;
      }
    });
    
    return partFiles;
  }
  /**
   * Obtain the avro input schema from data
   * @param conf
   * @return
   * @throws IOException
   */
  public static Schema getAvroInputSchema(JobConf conf) throws IOException
  {
    Path[] paths = FileInputFormat.getInputPaths(conf);
    if (paths == null)
    {
      throw new IllegalStateException("input paths do not exist in jobConf!");
    }
    Schema inputSchema = AvroUtils.getSchemaFromFile(conf, paths[0]);
    if (inputSchema == null)
    {
      throw new IllegalStateException("Input does not have schema info and/or input is missing.");
    }
    return inputSchema;
  }
  
  /**
   * Run an avro hadoop job with job conf
   * @param conf
   * @throws Exception
   */
  public static void runAvroJob(JobConf conf) throws Exception
  {
    Path[] inputPaths = AvroInputFormat.getInputPaths(conf);
    _log.info("Running hadoop job with input paths:");
    for (Path inputPath : inputPaths)
    {
      _log.info(inputPath);
    }
    _log.info("Output path="+AvroOutputFormat.getOutputPath(conf));
    Job job = new Job(conf);
    job.setJarByClass(AvroUtils.class);
    job.waitForCompletion(true);
  }
  
  /**
   * Obtain a DataFileStream given a conf and path
   * @param conf
   * @param path
   * @return
   * @throws IOException
   */
  public static DataFileStream<Object> getAvroDataStream(JobConf conf, Path path) throws IOException
  {
    FileSystem fs = path.getFileSystem(conf);
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    InputStream hdfsInputStream = fs.open(path);
    return new DataFileStream<Object>(hdfsInputStream, avroReader);
  }
  public static void addAvroCacheFilesAndSetTheProperty(JobConf conf,
                                                          Path inputPath,
                                                          String property) throws Exception
  {
    addAvroCacheFiles(conf, inputPath);
    conf.set(property, inputPath.toString());
  }
}