// -*- mode:Java; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*-

/**
 *
 * Licensed under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 *
 * Implements the Hadoop FS interfaces to allow applications to store
 * files in Ceph.
 */
package org.apache.hadoop.fs.ceph;


import java.io.IOException;
import java.io.FileNotFoundException;
import java.io.OutputStream;
import java.net.URI;
import java.net.InetAddress;
import java.util.EnumSet;
import java.lang.Math;
import java.util.ArrayList;
import java.util.Map;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.fs.FsStatus;

import com.ceph.fs.CephFileAlreadyExistsException;
import com.ceph.fs.CephNotDirectoryException;
import com.ceph.fs.CephMount;
import com.ceph.fs.CephStat;
import com.ceph.fs.CephStatVFS;
import com.ceph.crush.Bucket;
import com.ceph.fs.CephFileExtent;


/**
 * Known Issues:
 *
 *   1. Per-file replication and block size are ignored.
 */
public class CephFileSystem extends FileSystem {
  /** Logger for this class; also handed to the CephTalker created in initialize(). */
  private static final Log LOG = LogFactory.getLog(CephFileSystem.class);
  /** URI of this file system (scheme and authority only); assigned in initialize(). */
  private URI uri;

  /** Directory against which makeAbsolute() resolves relative paths. */
  private Path workingDir;
  /** Bridge to the Ceph client; created in initialize() if still null at that point. */
  private CephFsProto ceph = null;
  /** Stripe count passed to ceph.open() when create() lays out a new file. */
  private static final int CEPH_STRIPE_COUNT = 1;
  /** Not referenced by any code in this class. */
  private TreeMap<Integer, String> datapools = null;

  /**
   * Construct an unconfigured CephFileSystem. No Ceph client exists until
   * {@link #initialize(URI, Configuration)} has been called.
   */
  public CephFileSystem() {
    super();
  }

  /**
   * Construct a CephFileSystem and record the supplied configuration.
   * {@link #initialize(URI, Configuration)} must still be called before use.
   * @param conf The configuration to store via setConf().
   */
  public CephFileSystem(Configuration conf) {
    super();
    setConf(conf);
  }

  /**
   * Resolve a path against the current working directory.
   * @param path The path to resolve.
   * @return path itself when it is already absolute, otherwise a new Path
   *   rooted at the working directory.
   */
  private Path makeAbsolute(Path path) {
    return path.isAbsolute() ? path : new Path(workingDir, path);
  }

  /**
   * @return the URI recorded by initialize(); null before initialize() ran.
   */
  @Override
  public URI getUri() {
    return this.uri;
  }

  /**
   * Prepare this file system for use: run the base-class initialization,
   * create the Ceph client if none is present, initialize it, store the
   * configuration, and record the file system URI and working directory.
   * @param uri The URI this file system was requested with.
   * @param conf The configuration to use.
   * @throws IOException if the base class or the Ceph client fails to initialize.
   */
  @Override
  public void initialize(URI uri, Configuration conf) throws IOException {
    super.initialize(uri, conf);

    if (null == ceph) {
      ceph = new CephTalker(conf, LOG);
    }
    ceph.initialize(uri, conf);
    setConf(conf);

    // Keep only scheme and authority.
    // NOTE(review): a URI without an authority produces "<scheme>://null"
    // here -- confirm whether that is intended.
    final String base = uri.getScheme() + "://" + uri.getAuthority();
    this.uri = URI.create(base);
    workingDir = getHomeDirectory();
  }

  /**
   * Open a Ceph file and attach the file handle to an FSDataInputStream.
   * The descriptor is released again if anything fails before the stream
   * has been fully constructed.
   * @param path The file to open
   * @param bufferSize Ceph does internal buffering; but you can buffer in
   *   the Java code too if you like.
   * @return FSDataInputStream reading from the given path.
   * @throws IOException if the path does not exist or is a directory, or
   *   there is an error getting data to set up the FSDataInputStream.
   */
  @Override
  public FSDataInputStream open(Path path, int bufferSize) throws IOException {
    path = makeAbsolute(path);

    // NOTE(review): the original comment states ceph.open() throws
    // FileNotFoundException when path is a directory -- not verifiable here.
    int fd = ceph.open(path, CephMount.O_RDONLY, 0);

    // Until the stream wraps the descriptor, this method is responsible
    // for closing it.
    boolean handedOff = false;
    try {
      /* get file size */
      CephStat stat = new CephStat();
      ceph.fstat(fd, stat);

      CephInputStream istream = new CephInputStream(getConf(), ceph, fd,
          stat.size, bufferSize);
      FSDataInputStream result = new FSDataInputStream(istream);
      handedOff = true;
      return result;
    } finally {
      if (!handedOff) {
        ceph.close(fd);
      }
    }
  }

  /**
   * Close down the CephFileSystem. Runs the base-class close method and
   * then shuts down the Ceph client. The client is shut down even when the
   * base-class close fails, and the call is a no-op for the client when
   * initialize() was never invoked.
   * @throws IOException if the base-class close or the client shutdown fails.
   */
  @Override
  public void close() throws IOException {
    try {
      super.close(); // this method does stuff, make sure it's run!
    } finally {
      // ceph stays null until initialize() has created the client.
      if (ceph != null) {
        ceph.shutdown();
      }
    }
  }

  /**
   * Open an existing file for appending and return a stream writing to it.
   * @param path The file to append to.
   * @param bufferSize Size of the Java-side buffer handed to the output
   *   stream; Ceph buffers internally as well.
   * @param progress Optional Progressable; it is notified once before and
   *   once after the file is opened. May be null.
   * @return An FSDataOutputStream connected to the file on Ceph.
   * @throws IOException If the file cannot be found or appended to.
   */
  @Override
  public FSDataOutputStream append(Path path, int bufferSize,
      Progressable progress) throws IOException {
    path = makeAbsolute(path);
    final boolean reporting = (progress != null);

    if (reporting) {
      progress.progress();
    }

    final int openFlags = CephMount.O_WRONLY | CephMount.O_APPEND;
    final int fd = ceph.open(path, openFlags, 0);

    if (reporting) {
      progress.progress();
    }

    return new FSDataOutputStream(
        new CephOutputStream(getConf(), ceph, fd, bufferSize), statistics);
  }

  /**
   * @return the directory relative paths are currently resolved against.
   */
  @Override
  public Path getWorkingDirectory() {
    return this.workingDir;
  }

  /**
   * Change the working directory. A relative argument is resolved against
   * the current working directory first.
   * @param dir The new working directory.
   */
  @Override
  public void setWorkingDirectory(Path dir) {
    final Path resolved = makeAbsolute(dir);
    this.workingDir = resolved;
  }

  /**
   * Create a directory together with any missing parents. A directory that
   * already exists is not treated as an error.
   * @param path The directory path to create
   * @param perms The permissions to apply to the created directories.
   * @return true whenever the call completes without throwing.
   * @throws IOException if the path is a child of a file.
   */
  @Override
  public boolean mkdirs(Path path, FsPermission perms) throws IOException {
    final Path target = makeAbsolute(path);
    final int mode = (int) perms.toShort();

    try {
      ceph.mkdirs(target, mode);
    } catch (CephFileAlreadyExistsException alreadyThere) {
      // The directory is already present, which counts as success.
    }

    return true;
  }

  /**
   * Create a directory together with any missing parents, using the default
   * directory permission with the configured umask applied.
   * @param f The directory path to create
   * @return true if successful, false otherwise
   * @throws IOException if the path is a child of a file.
   */
  @Override
  public boolean mkdirs(Path f) throws IOException {
    final FsPermission dirDefault = FsPermission.getDirDefault();
    final FsPermission umask = FsPermission.getUMask(getConf());
    return mkdirs(f, dirDefault.applyUMask(umask));
  }

  /**
   * Stat a path and convert the result into a Hadoop FileStatus. The owner
   * is reported as the "user.name" system property and the group is left
   * null; Ceph's ownership information is not consulted.
   * @param path The path to stat.
   * @return FileStatus object containing the stat information.
   * @throws FileNotFoundException if the path could not be resolved.
   */
  @Override
  public FileStatus getFileStatus(Path path) throws IOException {
    final Path target = makeAbsolute(path);

    final CephStat stat = new CephStat();
    ceph.lstat(target, stat);

    return new FileStatus(
        stat.size,
        stat.isDir(),
        ceph.get_file_replication(target),
        stat.blksize,
        stat.m_time,
        stat.a_time,
        new FsPermission((short) stat.mode),
        System.getProperty("user.name"),
        null,
        target.makeQualified(this));
  }

  /**
   * List a path. For a file the result holds that file's own status; for a
   * directory it holds one FileStatus per entry returned by the Ceph
   * directory listing.
   * @param path The file or directory to list.
   * @return FileStatus[] with one element per listed entry.
   * @throws FileNotFoundException if the directory listing comes back null.
   * @throws IOException on any other failure talking to Ceph.
   */
  @Override
  public FileStatus[] listStatus(Path path) throws IOException {
    path = makeAbsolute(path);

    if (isFile(path)) {
      return new FileStatus[] { getFileStatus(path) };
    }

    final String[] names = ceph.listdir(path);
    if (names == null) {
      throw new FileNotFoundException("File " + path + " does not exist.");
    }

    final FileStatus[] listing = new FileStatus[names.length];
    int slot = 0;
    for (String name : names) {
      listing[slot++] = getFileStatus(new Path(path, name));
    }
    return listing;
  }

  /**
   * Change the permission bits of a path.
   * @param path The path to modify; resolved against the working directory.
   * @param permission The permission to apply.
   * @throws IOException if the Ceph chmod call fails.
   */
  @Override
  public void setPermission(Path path, FsPermission permission) throws IOException {
    final Path target = makeAbsolute(path);
    ceph.chmod(target, permission.toShort());
  }

  /**
   * Set modification and/or access time of a path. A value of -1 leaves the
   * corresponding timestamp untouched.
   * @param path The path to modify; resolved against the working directory.
   * @param mtime New modification time, or -1 to keep the current one.
   * @param atime New access time, or -1 to keep the current one.
   * @throws IOException if the Ceph setattr call fails.
   */
  @Override
  public void setTimes(Path path, long mtime, long atime) throws IOException {
    final Path target = makeAbsolute(path);
    final CephStat attrs = new CephStat();

    final boolean changeMtime = (mtime != -1);
    final boolean changeAtime = (atime != -1);

    // Build the attribute mask from the timestamps that were requested.
    int mask = 0;
    if (changeMtime) {
      attrs.m_time = mtime;
      mask |= CephMount.SETATTR_MTIME;
    }
    if (changeAtime) {
      attrs.a_time = atime;
      mask |= CephMount.SETATTR_ATIME;
    }

    ceph.setattr(target, attrs, mask);
  }

  /**
   * Get data pools from configuration. The configured value is a
   * comma-separated list; surrounding whitespace is stripped from each
   * entry and empty entries (e.g. from a trailing comma or an empty value)
   * are dropped.
   *
   * Package-private: used by unit tests
   *
   * @return the configured pool names in configuration order; an empty
   *   array when nothing is configured.
   */
  String[] getConfiguredDataPools() {
    String pool_list = getConf().get(
        CephConfigKeys.CEPH_DATA_POOLS_KEY,
        CephConfigKeys.CEPH_DATA_POOLS_DEFAULT);

    if (pool_list == null)
      return new String[0];

    ArrayList<String> names = new ArrayList<String>();
    for (String candidate : pool_list.split(",")) {
      String name = candidate.trim();
      /* "a, b" must yield "b", not " b"; "" must not yield a pool named "" */
      if (name.length() > 0)
        names.add(name);
    }

    return names.toArray(new String[names.size()]);
  }

  /**
   * Look up the replication factor of a pool by name: the name is first
   * translated to a pool id, which is then used to query the replication.
   *
   * Package-private: used by unit tests
   *
   * @param pool_name Name of the pool to query.
   * @return the value reported by the Ceph client for that pool.
   * @throws IOException if either lookup fails.
   */
  int getPoolReplication(String pool_name) throws IOException {
    return ceph.get_pool_replication(ceph.get_pool_id(pool_name));
  }

  /**
   * Select a data pool given the requested replication factor.
   *
   * Candidate pools are the pool of the root directory plus every pool
   * named in the configuration. The pool with the smallest replication
   * that is at least repl_wanted is chosen; if none qualifies, the pool
   * with the largest replication is used.
   *
   * @param path Path being created; used for logging only.
   * @param repl_wanted Requested replication factor.
   * @return name of the selected pool.
   * @throws IOException if the default pool cannot be determined.
   */
  private String selectDataPool(Path path, int repl_wanted) throws IOException {
    /* map pool size -> pool name */
    TreeMap<Integer, String> pools = new TreeMap<Integer, String>();

    /*
     * Start with a mapping for the default pool. An error here would indicate
     * something bad, so we throw any exceptions. For configured pools we
     * ignore some errors.
     */
    String pool_name;
    int fd = ceph.__open(new Path("/"), CephMount.O_RDONLY, 0);
    try {
      pool_name = ceph.get_file_pool_name(fd);
    } finally {
      /* release the descriptor even when the pool lookup fails */
      ceph.close(fd);
    }
    int replication = getPoolReplication(pool_name);
    pools.put(Integer.valueOf(replication), pool_name);

    /*
     * Insert extra data pools from configuration. Errors are logged (most
     * likely a non-existent pool), and a configured pool with the same
     * replication replaces the default pool in the map.
     */
    String[] conf_pools = getConfiguredDataPools();
    for (String name : conf_pools) {
      try {
        replication = getPoolReplication(name);
        pools.put(Integer.valueOf(replication), name);
      } catch (IOException e) {
        LOG.warn("Error looking up replication of pool: " + name + ", " + e);
      }
    }

    /* Choose smallest entry >= target, or largest in map. */
    Map.Entry<Integer, String> entry = pools.ceilingEntry(Integer.valueOf(repl_wanted));
    if (entry == null)
      entry = pools.lastEntry();

    /* should always contain default pool */
    assert(entry != null);

    replication = entry.getKey().intValue();
    pool_name = entry.getValue();

    /* log non-exact match cases */
    if (replication != repl_wanted) {
      LOG.info("selectDataPool path=" + path + " pool:repl=" +
          pool_name + ":" + replication + " wanted=" + repl_wanted);
    }

    return pool_name;
  }

  /**
   * Create a new file and open an FSDataOutputStream that's connected to it.
   * Missing parent directories are created with the default permissions.
   * @param path The file to create.
   * @param permission The permissions to apply to the file.
   * @param overwrite If true, overwrite any existing file with
   *   this name; otherwise don't.
   * @param bufferSize Ceph does internal buffering, but you can buffer
   *   in the Java code too if you like.
   * @param replication Replication factor. See documentation on the
   *   "ceph.data.pools" configuration option.
   * @param blockSize Requested block size; used as both stripe unit and
   *   object size of the new file after being clamped to the int range and
   *   aligned to the stripe unit granularity.
   * @param progress A Progressable to report back to. May be null.
   *   Reporting is limited but exists.
   * @return An FSDataOutputStream pointing to the created file.
   * @throws IOException if the path is an existing directory, or the path
   *   exists but overwrite is false, or there is a failure in attempting to
   *   open the file with Ceph.
   * @throws IllegalArgumentException if blockSize is not positive.
   */
  @Override
  public FSDataOutputStream create(Path path, FsPermission permission,
      boolean overwrite, int bufferSize, short replication, long blockSize,
      Progressable progress) throws IOException {

    path = makeAbsolute(path);

    boolean exists = exists(path);

    if (progress != null) {
      progress.progress();
    }

    int flags = CephMount.O_WRONLY | CephMount.O_CREAT;

    if (exists) {
      if (overwrite)
        flags |= CephMount.O_TRUNC;
      else
        throw new FileAlreadyExistsException(path.toString());
    } else {
      Path parent = path.getParent();
      if (parent != null)
        if (!mkdirs(parent))
          throw new IOException("mkdirs failed for " + parent.toString());
    }

    if (progress != null) {
      progress.progress();
    }

    /* Sanity check. Ceph interface uses int for striping strategy */
    if (blockSize > Integer.MAX_VALUE) {
      blockSize = Integer.MAX_VALUE;
      LOG.info("blockSize too large. Rounding down to " + blockSize);
    }

    /*
     * If blockSize <= 0 then we complain. We need to explicitly check for the
     * < 0 case (as opposed to allowing Ceph to raise an exception) because
     * the ceph_open_layout interface accepts -1 to request Ceph-specific
     * defaults.
     */
    if (blockSize <= 0)
      throw new IllegalArgumentException("Invalid block size: " + blockSize);

    /*
     * Ceph may impose alignment restrictions on file layout. When the block
     * size is not aligned to the stripe unit granularity it is adjusted to
     * the next larger multiple, or to the next smaller one when the larger
     * one would no longer fit into an int.
     */
    int su = ceph.get_stripe_unit_granularity();
    long alignedBlockSize = alignBlockSize(blockSize, su);
    if (alignedBlockSize != blockSize) {
      LOG.debug("fix alignment: blksize " + blockSize + " new blksize " + alignedBlockSize);
      blockSize = alignedBlockSize;
    }

    /*
     * The default Ceph data pool is selected to store files unless a specific
     * data pool is provided when a file is created. Since a pool has a fixed
     * replication factor, in order to achieve a requested replication factor,
     * we must select an appropriate data pool to place the file into.
     */
    String datapool = selectDataPool(path, replication);
    int fd = ceph.open(path, flags, (int)permission.toShort(), (int)blockSize,
        CEPH_STRIPE_COUNT, (int)blockSize, datapool);

    if (progress != null) {
      progress.progress();
    }

    OutputStream ostream = new CephOutputStream(getConf(), ceph, fd,
        bufferSize);
    return new FSDataOutputStream(ostream, statistics);
  }

  /**
   * Align a block size to a multiple of the stripe unit granularity without
   * leaving the positive int range.
   * @param blockSize Positive block size, at most Integer.MAX_VALUE.
   * @param stripeUnit Granularity to align to; values <= 0 disable alignment.
   * @return blockSize if already aligned (or alignment is disabled), else the
   *   next larger multiple of stripeUnit, else -- when that would exceed
   *   Integer.MAX_VALUE -- the next smaller multiple.
   */
  private static long alignBlockSize(long blockSize, int stripeUnit) {
    if (stripeUnit <= 0)
      return blockSize; /* nothing sensible to align to; avoids division by zero */

    long remainder = blockSize % stripeUnit;
    if (remainder == 0)
      return blockSize;

    long roundedUp = blockSize - remainder + stripeUnit;
    if (roundedUp <= Integer.MAX_VALUE)
      return roundedUp;

    /*
     * Rounding up would wrap to a negative value once cast to int. This can
     * only happen when blockSize > stripeUnit, so rounding down still yields
     * a positive multiple.
     */
    return blockSize - remainder;
  }

  /**
   * Open an FSDataOutputStream at the given path without creating missing
   * parent directories. The parent is stat'ed first; afterwards the call is
   * delegated to create().
   * @param path the file name to open
   * @param permission the permissions to apply to the file
   * @param overwrite if a file with this name already exists, then if true,
   *   the file will be overwritten, and if false an error will be thrown.
   * @param bufferSize the size of the buffer to be used.
   * @param replication required block replication for the file.
   * @param blockSize requested block size, passed through to create()
   * @param progress Progressable passed through to create()
   * @return the stream returned by create()
   * @throws IOException if the parent cannot be stat'ed, the parent is a
   *   file, or create() fails.
   * @see #setPermission(Path, FsPermission)
   * @deprecated API only for 0.20-append
   */
  @Deprecated
  public FSDataOutputStream createNonRecursive(Path path, FsPermission permission,
      boolean overwrite,
      int bufferSize, short replication, long blockSize,
      Progressable progress) throws IOException {

    final Path target = makeAbsolute(path);
    final Path parent = target.getParent();

    if (null != parent) {
      // lstat covers the missing-parent case (see FileNotFoundException).
      final CephStat parentStat = new CephStat();
      ceph.lstat(parent, parentStat);
      if (parentStat.isFile()) {
        throw new FileAlreadyExistsException(parent.toString());
      }
    }

    return create(target, permission, overwrite, bufferSize, replication,
        blockSize, progress);
  }

  /**
   * Rename a file or directory.
   *
   * When dst is an existing directory the source is moved into it under
   * its own name; when dst exists and is not a directory the rename is
   * refused.
   *
   * @param src The current path of the file/directory
   * @param dst The new name for the path.
   * @return true if the rename succeeded, false otherwise.
   * @throws FileNotFoundException if the Ceph rename reports a missing path.
   * @throws IOException if stat'ing dst fails for a reason other than the
   *   path not existing.
   */
  @Override
  public boolean rename(Path src, Path dst) throws IOException {
    src = makeAbsolute(src);
    dst = makeAbsolute(dst);

    try {
      CephStat stat = new CephStat();
      ceph.lstat(dst, stat);
      if (stat.isDir())
        return rename(src, new Path(dst, src.getName()));
      return false;
    } catch (FileNotFoundException ignored) {
      /* dst does not exist yet: this is the normal case, fall through */
    }

    try {
      ceph.rename(src, dst);
    } catch (FileNotFoundException e) {
      throw e;
    } catch (Exception e) {
      /* contract is to report failure via the return value; keep the cause visible */
      LOG.warn("rename " + src + " -> " + dst + " failed", e);
      return false;
    }

    return true;
  }

  /**
   * Get a BlockLocation object for each block in a file.
   *
   * The file is walked extent by extent starting at start; at least one
   * extent is always reported, even when len is 0. The file handle opened
   * for the lookup is closed on every exit path.
   *
   * @param file A FileStatus object corresponding to the file you want locations for.
   * @param start The offset of the first part of the file you are interested in.
   * @param len The amount of the file past the offset you are interested in.
   * @return A BlockLocation[] where each object corresponds to a block within
   *   the given range, or null if the open call returns a negative handle.
   * @throws IOException if any of the Ceph lookups fails.
   */
  @Override
  public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) throws IOException {
    Path abs_path = makeAbsolute(file.getPath());

    int fh = ceph.open(abs_path, CephMount.O_RDONLY, 0);
    if (fh < 0) {
      LOG.error("getFileBlockLocations:got error " + fh + ", exiting and returning null!");
      return null;
    }

    ArrayList<BlockLocation> blocks = new ArrayList<BlockLocation>();

    long curPos = start;
    long endOff = curPos + len;
    try {
      do {
        CephFileExtent extent = ceph.get_file_extent(fh, curPos);

        int[] osds = extent.getOSDs();
        String[] names = new String[osds.length];
        String[] hosts = new String[osds.length];
        String[] racks = new String[osds.length];

        for (int i = 0; i < osds.length; i++) {
          InetAddress addr = ceph.get_osd_address(osds[i]);
          names[i] = addr.getHostAddress();

          /*
           * Grab the hostname and rack from the crush hierarchy. Currently we
           * hard code the item types. For a more general treatment, we'll need
           * a new configuration option that allows users to map their custom
           * crush types to hosts and topology.
           */
          Bucket[] crushPath = ceph.get_osd_crush_location(osds[i]);
          for (Bucket bucket : crushPath) {
            String type = bucket.getType();
            if ("host".equals(type))
              hosts[i] = bucket.getName();
            else if ("rack".equals(type))
              racks[i] = bucket.getName();
          }
        }

        long extentLength = extent.getLength();
        blocks.add(new BlockLocation(names, hosts, racks,
              extent.getOffset(), extentLength));

        /* an extent that makes no progress would otherwise loop forever */
        if (extentLength <= 0)
          break;

        curPos += extentLength;
      } while (curPos < endOff);
    } finally {
      /* do not leak the handle when a lookup above throws */
      ceph.close(fh);
    }

    return blocks.toArray(new BlockLocation[blocks.size()]);
  }

  /**
   * Delete a path without recursing into directories.
   * @param path The path to delete.
   * @return the result of {@link #delete(Path, boolean)} with recursive=false.
   * @throws IOException if the delegated delete fails.
   * @deprecated use {@link #delete(Path, boolean)} instead.
   */
  @Deprecated
  public boolean delete(Path path) throws IOException {
    final boolean recursive = false;
    return delete(path, recursive);
  }

  /**
   * Delete a file or directory.
   * @param path The path to delete.
   * @param recursive If true, a non-empty directory is deleted together
   *   with its contents; if false, deleting a non-empty directory fails.
   * @return true if the path was removed; false if the path does not exist
   *   or one of its children could not be removed.
   * @throws IOException if the directory is not empty and recursive is
   *   false, or a Ceph call fails.
   */
  @Override
  public boolean delete(Path path, boolean recursive) throws IOException {
    path = makeAbsolute(path);

    /* path exists? */
    FileStatus status;
    try {
      status = getFileStatus(path);
    } catch (FileNotFoundException e) {
      return false;
    }

    /* we're done if it's a file */
    if (status.isFile()) {
      ceph.unlink(path);
      return true;
    }

    /* get directory contents */
    FileStatus[] dirlist = listStatus(path);
    if (dirlist == null)
      return false;

    if (!recursive && dirlist.length > 0)
      throw new IOException("Directory " + path.toString() + " is not empty.");

    for (FileStatus fs : dirlist) {
      if (!delete(fs.getPath(), recursive))
        return false;
    }

    ceph.rmdir(path);
    return true;
  }

  /**
   * @return the default replication as reported by the Ceph client.
   */
  @Override
  public short getDefaultReplication() {
    final short replication = ceph.getDefaultReplication();
    return replication;
  }

  /**
   * @return the configured Ceph object size, or the built-in default when
   *   the configuration does not set one.
   */
  @Override
  public long getDefaultBlockSize() {
    final Configuration conf = getConf();
    return conf.getLong(CephConfigKeys.CEPH_OBJECT_SIZE_KEY,
        CephConfigKeys.CEPH_OBJECT_SIZE_DEFAULT);
  }
  
  /**
   * Report capacity, used and remaining space of the file system.
   * @param p Path to query; relative paths are resolved against the working
   *   directory, and null is treated as the root directory.
   * @return FsStatus built from the statfs result: capacity is
   *   bsize * blocks, remaining is bsize * bavail, used is the difference.
   * @throws IOException if the Ceph statfs call fails.
   */
  @Override
  public FsStatus getStatus(Path p) throws IOException {
    // Resolve the path like every other method in this class does; a null
    // path stands for the file system as a whole.
    Path target = (p == null) ? new Path("/") : makeAbsolute(p);

    CephStatVFS stat = new CephStatVFS();
    ceph.statfs(target, stat);

    FsStatus status = new FsStatus(stat.bsize * stat.blocks,
        stat.bsize * (stat.blocks - stat.bavail),
        stat.bsize * stat.bavail);
    return status;
  }

  /**
   * @return the configured Ceph port, or the built-in default when the
   *   configuration does not set one.
   */
  @Override
  protected int getDefaultPort() {
    final Configuration conf = getConf();
    return conf.getInt(CephConfigKeys.CEPH_PORT,
        CephConfigKeys.CEPH_PORT_DEFAULT);
  }

  }